From bb02f5cc959ad1ee870a70614ef577129d3cc1a9 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 16:27:45 -0800
Subject: [PATCH 01/15] preparing to refactor rapids.sh into a template

---
 rapids/rapids.sh => templates/rapids/rapids.sh.in | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename rapids/rapids.sh => templates/rapids/rapids.sh.in (100%)

diff --git a/rapids/rapids.sh b/templates/rapids/rapids.sh.in
similarity index 100%
rename from rapids/rapids.sh
rename to templates/rapids/rapids.sh.in

From d46cadf8e3e79518f6c7ea601f7d5b2c267663d2 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 23:05:18 -0800
Subject: [PATCH 02/15] refactored to be nearer in shape to template output

---
 templates/rapids/rapids.sh.in | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index 6c5c9d411..3fd48089e 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -69,9 +69,6 @@ function get_metadata_attribute() (
   get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
 )
 
-function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; }
-function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; }
-
 function execute_with_retries() {
   local -r cmd="$*"
   for i in {0..9} ; do
@@ -83,6 +80,15 @@ function execute_with_retries() {
   return 1
 }
 
+function restart_knox() {
+  systemctl stop knox
+  rm -rf "${KNOX_HOME}/data/deployments/*"
+  systemctl start knox
+}
+
+function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; }
+function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; }
+
 function configure_dask_yarn() {
   readonly DASK_YARN_CONFIG_DIR=/etc/dask/
   readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml
@@ -201,12 +207,6 @@ function install_systemd_dask_service() {
   install_systemd_dask_worker
 }
 
-function restart_knox() {
-  systemctl stop knox
-  rm -rf "${KNOX_HOME}/data/deployments/*"
-  systemctl start knox
-}
-
 function configure_knox_for_dask() {
   if [[ ! -d "${KNOX_HOME}" ]]; then
     echo "Skip configuring Knox rules for Dask"

From d6be3a4b41342a96eb5b426052f67e89fea93f1e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Mon, 6 Jan 2025 23:47:57 -0800
Subject: [PATCH 03/15] included templates from which rapids.sh is built and
 instructions in presubmit.sh for generating actions

---
 cloudbuild/presubmit.sh                 |  10 +-
 integration_tests/dataproc_test_case.py |  26 +-
 templates/common/template_disclaimer    |   5 +
 templates/common/util_functions         | 687 ++++++++++++++++++++++++
 templates/dask/util_functions           | 555 +++++++++++++++++++
 templates/generate-action.pl            |  24 +
 templates/gpu/util_functions            | 220 ++++++++
 templates/legal/license_header          |  13 +
 templates/rapids/rapids.sh.in           | 644 +---------------------
 9 files changed, 1555 insertions(+), 629 deletions(-)
 create mode 100644 templates/common/template_disclaimer
 create mode 100644 templates/common/util_functions
 create mode 100644 templates/dask/util_functions
 create mode 100644 templates/generate-action.pl
 create mode 100644 templates/gpu/util_functions
 create mode 100644 templates/legal/license_header

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index eec7adb76..fc664f1bf 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -48,8 +48,14 @@ initialize_git_repo() {
 # to determine all changed files and looks for tests in directories with changed files.
 determine_tests_to_run() {
   # Infer the files that changed
+  mapfile -t CHANGED_ACTION_TEMPLATES < <(git diff origin/master --name-only | grep 'templates/.*/.*\.sh\.in')
+  for tt in "${CHANGED_ACTION_TEMPLATES[@]}"; do
+    local genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"`
+    perl templates/generate-action.pl "${genfile}" > "${genfile}"
+  done
+
   mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD)
-  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only)
+  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only | grep -v template)
   echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}"
   echo "Changed files: ${CHANGED_FILES[*]}"
 
@@ -70,6 +76,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
@@ -104,7 +111,6 @@ run_tests() {
   bazel test \
     --jobs="${max_parallel_tests}" \
     --local_test_jobs="${max_parallel_tests}" \
-    --flaky_test_attempts=3 \
     --action_env="INTERNAL_IP_SSH=true" \
     --test_output="all" \
     --noshow_progress \
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 936718498..683109125 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -7,6 +7,8 @@
 import string
 import subprocess
 import sys
+import time
+import random
 from threading import Timer
 
 import pkg_resources
@@ -123,7 +125,7 @@ def createCluster(self,
 
         for i in init_actions:
             if "install_gpu_driver.sh" in i or "horovod.sh" in i or \
-                     "dask-rapids.sh"  in i or "mlvm.sh"    in i or \
+                           "rapids.sh" in i or "mlvm.sh"    in i or \
                      "spark-rapids.sh" in i:
                 args.append("--no-shielded-secure-boot")
 
@@ -287,10 +289,24 @@ def assert_instance_command(self,
             AssertionError: if command returned non-0 exit code.
         """
 
-        ret_code, stdout, stderr = self.assert_command(
-            'gcloud compute ssh {} --zone={} --command="{}"'.format(
-                instance, self.cluster_zone, cmd), timeout_in_minutes)
-        return ret_code, stdout, stderr
+        retry_count = 5
+
+        ssh_cmd='gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60'.format(
+          instance, self.cluster_zone, cmd)
+
+        while retry_count > 0:
+          try:
+            ret_code, stdout, stderr = self.assert_command(
+                ssh_cmd, timeout_in_minutes )
+            return ret_code, stdout, stderr
+          except Exception as e:
+            print("An error occurred: ", e)
+            retry_count -= 1
+            if retry_count > 0:
+              time.sleep( 3 + random.randint(1, 10) )
+              continue
+            else:
+              raise
 
     def assert_dataproc_job(self,
                             cluster_name,
diff --git a/templates/common/template_disclaimer b/templates/common/template_disclaimer
new file mode 100644
index 000000000..3b417deff
--- /dev/null
+++ b/templates/common/template_disclaimer
@@ -0,0 +1,5 @@
+# This initialization action is generated from
+# initialization-actions/templates/[% template_path %]
+#
+# Modifications made directly to the generated file will be lost when
+# the template is re-evaluated
diff --git a/templates/common/util_functions b/templates/common/util_functions
new file mode 100644
index 000000000..0f0bfeaa6
--- /dev/null
+++ b/templates/common/util_functions
@@ -0,0 +1,687 @@
+function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; )
+function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
+function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
+
+# Version (or real number) comparison helpers, ordered as `sort -V` orders them.
+# version_ge / version_gt / version_le / version_lt succeed when the first argument is >= / > / <= / < the second
+# ( version_ge 2.0 2.1 ) evaluates to false
+# ( version_ge 2.2 2.1 ) evaluates to true
+function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
+function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
+function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
+function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
+
+function define_os_comparison_functions() {
+  # Generates is_<os>, is_<os><major>, ge_<os><major>, le_<os><major> and is_debuntu via eval.
+  readonly -A supported_os=(
+    ['debian']="10 11 12"
+    ['rocky']="8 9"
+    ['ubuntu']="18.04 20.04 22.04"
+  )
+
+  # dynamically define OS version test utility functions
+  if [[ "$(os_id)" == "rocky" ]];
+  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')   # rocky: keep only the leading digits
+  else _os_version="$(os_version)"; fi
+  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+    # NOTE: $(os_id) and ${_os_version} are expanded now, so the generated functions compare fixed strings
+    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+    done
+  done
+  eval "function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )"
+}
+
+function os_vercat()   ( set +x   # print the OS version in a compact, per-distribution form
+  if   is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'      # ubuntu: remove every non-digit character
+  elif is_rocky  ; then os_version | sed -e 's/[^0-9].*$//g'   # rocky: cut at the first non-digit character
+                   else os_version ; fi ; )                    # otherwise: print the version unchanged
+
+function repair_old_backports {
+  if ! is_debuntu ; then return ; fi
+  # This script uses 'apt-get update' and is therefore potentially dependent on
+  # backports repositories which have been archived.  In order to mitigate this
+  # problem, we will use archive.debian.org for the oldoldstable repo
+
+  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
+  debdists="https://deb.debian.org/debian/dists"
+  oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
+  oldstable=$(   curl -s "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
+  stable=$(      curl -s "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');
+
+  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )
+
+  for filename in "${matched_files[@]}"; do
+    # Fetch from archive.debian.org for ${oldoldstable}-backports
+    perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
+                  {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
+  done
+}
+
+function print_metadata_value() {  # $1: metadata URL; prints the value on success, returns curl's exit status
+  local -r tmpfile="$(mktemp)"     # 'local readonly x=..' would declare a variable *named* readonly instead
+  http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
+    -s -o "${tmpfile}" 2>/dev/null)
+  local -r return_code=$?
+  # If the command completed successfully, print the metadata value to stdout.
+  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
+    cat "${tmpfile}"
+  fi
+  rm -f "${tmpfile}"
+  return ${return_code}
+}
+
+function print_metadata_value_if_exists() {  # $1: metadata URL; returns the status of print_metadata_value
+  local return_code=1
+  local -r url=$1   # -r makes url read-only; 'local readonly url' only created an extra variable named readonly
+  print_metadata_value "${url}"
+  return_code=$?
+  return ${return_code}
+}
+
+# replicates /usr/share/google/get_metadata_value
+function get_metadata_value() (  # $1: metadata path; tries the instance scope first, then the project scope
+  set +x
+  local -r varname=$1   # -r makes varname read-only; 'local readonly varname' did not
+  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
+  # Print the instance metadata value.
+  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}"
+  return_code=$?
+  # If the instance doesn't have the value, try the project.
+  if [[ ${return_code} != 0 ]]; then
+    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}"
+    return_code=$?
+  fi
+
+  return ${return_code}
+)
+
+function get_metadata_attribute() (   # $1: attribute name, $2: optional default printed when the lookup fails
+  set +x
+  local -r attribute_name="$1"
+  local -r default_value="${2:-}"      # empty string when no default is supplied
+  get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
+)
+
+function execute_with_retries() (  # runs "$*" through eval, at most 3 attempts; output is written to ${install_log}
+  set +x
+  local -r cmd="$*" apt_install_re='^apt-get( .*)? install'
+  # The pattern must be expanded unquoted: a quoted right-hand side of =~ is matched as a literal string
+  if [[ "$cmd" =~ ${apt_install_re} ]] ; then
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+  fi
+  for ((i = 0; i < 3; i++)); do
+    set -x
+    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    set +x
+    if [[ $retval == 0 ]] ; then return 0 ; fi
+    sleep 5
+  done
+  return 1
+)
+
+function cache_fetched_package() {   # fetch a file, using a GCS object as a cache in front of the source URL
+  local src_url="$1"                 # upstream URL to download from on a cache miss
+  local gcs_fn="$2"                  # GCS object used as the cache
+  local local_fn="$3"                # local destination path
+
+  while ! command -v gcloud ; do sleep 5s ; done   # block until gcloud is on PATH
+
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"   # cache hit: copy from GCS
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )   # cache miss: download, then populate the cache
+  fi
+}
+
+function add_contrib_component() {   # enable the 'contrib' apt component on Debian
+  if ! is_debuntu ; then return ; fi  # nothing to do on non-apt systems
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-kernel-open-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then               # older Debian: append contrib to lines ending in ' main'
+      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+  fi                                  # Ubuntu matches neither branch and is left unchanged
+}
+
+function set_hadoop_property() {
+  local -r config_file=$1
+  local -r property=$2
+  local -r value=$3
+  "${bdcfg}" set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
+    --name "${property}" --value "${value}" \
+    --clobber
+}
+
+function clean_up_sources_lists() {
+  #
+  # bigtop (primary)
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
+
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
+
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    fi
+
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
+
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
+
+  #
+  # adoptium
+  #
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
+
+
+  #
+  # docker
+  #
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
+
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
+
+  #
+  # google cloud + logging/monitoring
+  #
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+      fi
+    done
+  fi
+
+  #
+  # cran-r
+  #
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
+
+  #
+  # mysql
+  #
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  fi
+
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+
+}
+
+function set_proxy(){   # export proxy variables from the 'http-proxy' metadata attribute, if it is set
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
+
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi   # no proxy configured: leave the environment alone
+
+  export METADATA_HTTP_PROXY
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+  local no_proxy_svc
+  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
+                      bigquery composer      pubsub bigquerydatatransfer dataflow \
+                      storage  datafusion    ; do
+    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"   # bypass the proxy for each listed googleapis.com host
+  done
+  # NOTE(review): only upper-case NO_PROXY is exported below; lower-case no_proxy is assigned but not exported here
+  export NO_PROXY="${no_proxy}"
+}
+
+function is_ramdisk() {  # succeeds when /mnt/shm is mounted; the answer is cached in IS_RAMDISK, -f re-probes
+  if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi
+  if   [[ "${IS_RAMDISK:-}" == "true"  ]] ; then return 0
+  elif [[ "${IS_RAMDISK:-}" == "false" ]] ; then return 1 ; fi
+  # Compare inside [[ ]]: a bare '"${IS_RAMDISK}" == "false"' runs the command 'false' and ignores the cached value
+  if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then
+    IS_RAMDISK="true"
+    return 0
+  else
+    IS_RAMDISK="false"
+    return 1
+  fi
+}
+
+function mount_ramdisk(){   # mount tmpfs scratch areas when MemFree is at least 10500000 (kB, as /proc/meminfo reports)
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+
+  # Write to a ramdisk instead of churning the persistent disk
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}"
+  mount -t tmpfs tmpfs "${tmpdir}"
+  # Create the package directory after mounting; anything created beforehand is hidden by the new tmpfs
+  mkdir -p "${tmpdir}/pkgs_dirs"
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs"
+
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
+  else
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
+  is_ramdisk -f
+}
+
+function check_os() {
+  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
+
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    exit 1
+  fi
+
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
+}
+
+function configure_dkms_certs() {
+  if test -v PSN && [[ -z "${PSN}" ]]; then
+      echo "No signing secret provided.  skipping";
+      return 0
+  fi
+
+  mkdir -p "${CA_TMPDIR}"
+
+  # If the private key exists, verify it
+  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
+    echo "Private key material exists"
+
+    local expected_modulus_md5sum
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
+    if [[ -n "${expected_modulus_md5sum}" ]]; then
+      modulus_md5sum="${expected_modulus_md5sum}"
+
+      # Verify that the private key's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
+
+      # Verify that the x509 certificate's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
+    fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+
+    return
+  fi
+
+  # Retrieve cloud secrets keys
+  local sig_priv_secret_name
+  sig_priv_secret_name="${PSN}"
+  local sig_pub_secret_name
+  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
+  local sig_secret_project
+  sig_secret_project="$(get_metadata_attribute secret_project)"
+  local sig_secret_version
+  sig_secret_version="$(get_metadata_attribute secret_version)"
+
+  # If metadata values are not set, do not write mok keys
+  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi
+
+  # Write private material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_priv_secret_name}" \
+      | dd status=none of="${CA_TMPDIR}/db.rsa"
+
+  # Write public material to volatile storage
+  gcloud secrets versions access "${sig_secret_version}" \
+         --project="${sig_secret_project}" \
+         --secret="${sig_pub_secret_name}" \
+      | base64 --decode \
+      | dd status=none of="${CA_TMPDIR}/db.der"
+
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
+}
+
+function clear_dkms_key {   # remove the signing material directory and the MOK key path
+  if [[ -z "${PSN}" ]]; then   # PSN is the private_secret_name metadata attribute read in check_secure_boot
+      echo "No signing secret provided.  skipping" >&2
+      return 0
+  fi
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
+}
+
+function check_secure_boot() {   # read secure-boot state; set PSN, CA_TMPDIR, mok_key and mok_der
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')   # second word of mokutil's state line
+
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
+
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster."
+    return   # NOTE(review): returns without setting CA_TMPDIR / mok_key / mok_der - confirm callers tolerate that
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Consider either disabling secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return
+  fi
+
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"   # -u: only generates a name, nothing is created here
+  readonly CA_TMPDIR
+
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
+}
+
+function restart_knox() {   # stop knox, clear its deployment directory, start it again
+  systemctl stop knox
+  rm -rf "${KNOX_HOME}/data/deployments/"*   # the glob must sit outside the quotes, otherwise it is a literal '*'
+  systemctl start knox
+}
+
+function is_complete() {   # $1: phase name; succeeds when the marker file for that phase exists
+  local phase="$1"         # local, so the name does not leak into the caller's scope
+  test -f "${workdir}/complete/${phase}"
+}
+
+function mark_complete() {   # $1: phase name; creates the marker file for that phase
+  local phase="$1"           # local, so the name does not leak into the caller's scope
+  touch "${workdir}/complete/${phase}"
+}
+
+function mark_incomplete() {   # $1: phase name; removes the marker file for that phase
+  local phase="$1"             # local, so the name does not leak into the caller's scope
+  rm -f "${workdir}/complete/${phase}"
+}
+
+function install_dependencies() {   # install helper packages once, guarded by the install-dependencies marker
+  is_complete install-dependencies && return 0
+
+  pkg_list="screen"
+  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}       # unquoted: the list word-splits
+  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+  mark_complete install-dependencies
+}
+
+function prepare_pip_env() {
+  # Create (if missing) and activate the venv under ${workdir}, then clear the pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv"
+  source "${workdir}/python-venv/bin/activate"
+
+  pip cache purge || echo "unable to purge pip cache"   # best effort: a failed purge is only reported
+  if is_ramdisk ; then
+    # Download pip packages to tmpfs
+    mkdir -p "${tmpdir}/cache-dir"
+    pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir"
+  fi
+}
+
+function prepare_conda_env() {   # back up ~/.condarc and, on a ramdisk, add a tmpfs package directory to conda
+  CONDA=/opt/conda/miniconda3/bin/conda
+  touch ~/.condarc                      # make sure the file exists so the copy below cannot fail on a missing file
+  cp ~/.condarc ~/.condarc.default      # conda_exit_handler moves this copy back
+  if is_ramdisk ; then
+    # Download conda packages to tmpfs
+    mkdir -p "${tmpdir}/conda_cache"
+    ${CONDA} config --add pkgs_dirs "${tmpdir}/conda_cache"
+  fi
+}
+
+function prepare_common_env() {
+  define_os_comparison_functions
+
+  # Verify OS compatibility and Secure boot state
+  check_os
+  check_secure_boot
+
+  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+
+  # Dataproc configurations
+  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+  readonly HIVE_CONF_DIR='/etc/hive/conf'
+  readonly SPARK_CONF_DIR='/etc/spark/conf'
+
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
+
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
+
+  # master node
+  MASTER="$(get_metadata_attribute dataproc-master)"
+  readonly MASTER
+
+  workdir=/opt/install-dpgce
+  tmpdir=/tmp/
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
+
+  # Knox config
+  readonly KNOX_HOME=/usr/lib/knox
+
+  mkdir -p "${workdir}/complete"
+  set_proxy
+  mount_ramdisk
+
+  readonly install_log="${tmpdir}/install.log"
+
+  is_complete prepare.common && return
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+    if is_ubuntu ; then
+      while ! command -v gcloud ; do sleep 5s ; done
+    fi
+  else
+    dnf clean all
+  fi
+
+  # When creating a disk image:
+  if [[ -n "$(get_metadata_attribute creating-image "")" ]]; then
+    df / > "/run/disk-usage.log"
+
+  # zero free disk space
+  ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  )
+
+    install_dependencies
+
+    # Monitor disk usage in a screen session
+    touch "/run/keep-running-df"
+    screen -d -m -LUS keep-running-df \
+      bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+ fi
+
+  mark_complete prepare.common
+}
+
+function pip_exit_handler() {   # undo the pip cache-dir override that prepare_pip_env sets on a ramdisk
+  if is_ramdisk ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+  fi
+}
+
+function conda_exit_handler() {   # restore the ~/.condarc copy saved by prepare_conda_env
+  mv ~/.condarc.default ~/.condarc
+}
+
+function common_exit_handler() {   # unmount tmpfs scratch areas, clean package caches, report disk usage for image builds
+  set +ex
+  echo "Exit handler invoked"
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if is_ramdisk ; then
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    done
+  fi
+
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # When creating image, print disk usage statistics, zero unused disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    # print disk usage statistics for large components
+    if is_ubuntu ; then
+      du -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+        /usr/lib \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3 | sort -h
+    elif is_debian ; then
+      du -x -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
+        /var/lib/{docker,mysql,} \
+        /opt/nvidia/* \
+        /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+        /usr/bin \
+        /usr \
+        /var \
+        / 2>/dev/null | sort -h
+    else
+      du -hs \
+        /var/lib/docker \
+        /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
+        /usr/lib64/google-cloud-sdk \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3
+    fi
+
+    # Process disk usage logs from installation period
+    rm -f /run/keep-running-df
+    sync
+    sleep 5.01s
+    # compute maximum size of disk during installation
+    # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+    df / | tee -a "/run/disk-usage.log"
+
+    perl -e \
+          '@siz=( sort { $b <=> $a }   # numeric, descending: $siz[0] is the largest sample
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting};
+print( "    samples-taken: ", scalar @siz, $/,
+       "starting-disk-used: $starting", $/,
+       "maximum-disk-used:  $max", $/,
+       "minimum-disk-used:  $min", $/,
+       "     increased-by:  $inc", $/ )' < "/run/disk-usage.log"
+
+
+    # zero free disk space
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
+  fi
+  echo "exit_handler has completed"
+}
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
new file mode 100644
index 000000000..d67da1fc1
--- /dev/null
+++ b/templates/dask/util_functions
@@ -0,0 +1,555 @@
+function configure_dask_yarn() {
+  readonly DASK_YARN_CONFIG_DIR=/etc/dask/
+  readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml
+  # Write the Dask-Yarn config file. Only minimal custom configuration is
+  # required here; see https://yarn.dask.org/en/latest/quickstart.html#usage
+  # for information on tuning Dask-Yarn environments.
+  mkdir -p "${DASK_YARN_CONFIG_DIR}"
+
+  local worker_class="dask.distributed.Nanny"  # default when nvidia-smi is not found
+  local gpu_count="0"
+  if command -v nvidia-smi ; then  # nvidia-smi on PATH: switch to the CUDA worker with gpus: 1
+    gpu_count="1"
+    worker_class="dask_cuda.CUDAWorker"
+  fi
+
+  cat <<EOF >"${DASK_YARN_CONFIG_FILE}"
+# Config file for Dask Yarn.
+#
+# These values are joined on top of the default config, found at
+# https://yarn.dask.org/en/latest/configuration.html#default-configuration
+
+yarn:
+  environment: python://${DASK_CONDA_ENV}/bin/python
+
+  worker:
+    count: 2
+    gpus: ${gpu_count}
+    worker_class: ${worker_class}
+EOF
+}
+
+function install_systemd_dask_worker() {  # write the worker launcher + systemd unit, then enable/restart if applicable
+  echo "Installing systemd Dask Worker service..."
+  local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}"
+
+  mkdir -p "${dask_worker_local_dir}"
+
+  local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh"
+
+  local compute_mode_cmd=""
+  if command -v nvidia-smi ; then compute_mode_cmd="nvidia-smi --compute-mode=DEFAULT" ; fi  # written into the launcher ahead of the worker command
+  local worker_name="dask worker"
+  if test -f "${DASK_CONDA_ENV}/bin/dask-cuda" ; then worker_name="dask-cuda worker" ; fi  # use dask-cuda when that executable exists in the env
+  local worker="${DASK_CONDA_ENV}/bin/${worker_name}"
+  cat <<EOF >"${DASK_WORKER_LAUNCHER}"
+#!/bin/bash
+LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log"
+${compute_mode_cmd}
+echo "${worker_name} starting, logging to \${LOGFILE}"
+${worker} --local-directory="${dask_worker_local_dir}" --memory-limit=auto "${MASTER}:8786" >> "\${LOGFILE}" 2>&1
+EOF
+
+  chmod 750 "${DASK_WORKER_LAUNCHER}"
+
+  local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service"
+  cat <<EOF >"${dask_service_file}"
+[Unit]
+Description=Dask Worker Service
+[Service]
+Type=simple
+Restart=on-failure
+ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}'
+[Install]
+WantedBy=multi-user.target
+EOF
+  chmod a+r "${dask_service_file}"
+
+  systemctl daemon-reload
+
+  # Decide whether this node runs a worker: always when ROLE is not "Master"
+  enable_systemd_dask_worker_service="0"
+  if [[ "${ROLE}" != "Master" ]]; then
+    enable_systemd_dask_worker_service="1"
+  else
+    # On a master: enable on a single-node cluster (no workers), or when both *-worker-on-master attributes are 'true' (their default)
+    local worker_count="$(get_metadata_attribute dataproc-worker-count)"
+    if ( [[ "${worker_count}" == "0" ]] ||
+         ( [[ "$(get_metadata_attribute dask-cuda-worker-on-master 'true')" == "true" ]] &&
+           [[ "$(get_metadata_attribute dask-worker-on-master 'true')"      == "true" ]] )
+       ) ; then
+      enable_systemd_dask_worker_service="1"
+    fi
+  fi
+  readonly enable_systemd_dask_worker_service  # global on purpose: also read by start_systemd_dask_service and configure_fluentd_for_dask
+
+  if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
+    systemctl enable "${DASK_WORKER_SERVICE}"
+    systemctl restart "${DASK_WORKER_SERVICE}"
+  fi
+}
+
+function install_systemd_dask_scheduler() {  # write the scheduler launcher + systemd unit and enable it
+  # only run scheduler on primary master (host whose short name equals ${MASTER})
+  if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi
+  echo "Installing systemd Dask Scheduler service..."
+  local -r dask_scheduler_local_dir="/tmp/${DASK_SCHEDULER_SERVICE}"
+
+  mkdir -p "${dask_scheduler_local_dir}"
+
+  local DASK_SCHEDULER_LAUNCHER="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh"
+
+  cat <<EOF >"${DASK_SCHEDULER_LAUNCHER}"
+#!/bin/bash
+LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log"
+echo "dask scheduler starting, logging to \${LOGFILE}"
+${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1
+EOF
+
+  chmod 750 "${DASK_SCHEDULER_LAUNCHER}"
+
+  local -r dask_service_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service"
+  cat <<EOF >"${dask_service_file}"
+[Unit]
+Description=Dask Scheduler Service
+[Service]
+Type=simple
+Restart=on-failure
+ExecStart=/bin/bash -c 'exec ${DASK_SCHEDULER_LAUNCHER}'
+[Install]
+WantedBy=multi-user.target
+EOF
+  chmod a+r "${dask_service_file}"
+
+  systemctl daemon-reload
+
+  # Enable the service; it is not started here (start_systemd_dask_service does that)
+  systemctl enable "${DASK_SCHEDULER_SERVICE}"
+}
+
+function install_systemd_dask_service() {  # install both Dask systemd units
+  install_systemd_dask_scheduler  # returns early unless this host is ${MASTER}
+  install_systemd_dask_worker
+}
+
+function start_systemd_dask_service() {  # start the scheduler (primary master only), then the worker if enabled
+  # only run scheduler on primary master
+  if [[ "$(hostname -s)" == "${MASTER}" ]]; then
+    date
+    time systemctl start "${DASK_SCHEDULER_SERVICE}"
+    local substate_val="$(systemctl show "${DASK_SCHEDULER_SERVICE}" -p SubState --value)"
+    if [[ "${substate_val}" != 'running' ]] ; then
+      cat "/var/log/${DASK_SCHEDULER_SERVICE}.log"  # dump the log to aid debugging
+    fi
+    systemctl status "${DASK_SCHEDULER_SERVICE}"
+  fi
+
+  echo "Starting Dask 'standalone' cluster..."
+  if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then
+    date
+    # Pause while scheduler comes online: up to 30 probes of port 8786, 3s apart
+    local retries=30
+    while ! nc -vz "${MASTER}" 8786 ; do
+      sleep 3s
+      ((retries--))
+      if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi
+    done
+    time systemctl start "${DASK_WORKER_SERVICE}"
+    local substate_val="$(systemctl show "${DASK_WORKER_SERVICE}" -p SubState --value)"
+    if [[ "${substate_val}" != 'running' ]] ; then
+      cat "/var/log/${DASK_WORKER_SERVICE}.log"  # dump the log to aid debugging
+    fi
+    systemctl status "${DASK_WORKER_SERVICE}"
+  fi
+
+  date
+}
+
+function configure_knox_for_dask() {  # add DASK and DASKWS service definitions for the Knox gateway
+  if [[ ! -d "${KNOX_HOME}" ]]; then  # no Knox home directory on this node
+    echo "Skip configuring Knox rules for Dask"
+    return 0
+  fi
+
+  local DASK_UI_PORT=8787
+  if [[ -f /etc/knox/conf/topologies/default.xml ]]; then  # NOTE(review): the insert below repeats on every run; confirm this runs once per node
+    sed -i \
+      "/<\/topology>/i <service><role>DASK<\/role><url>http://localhost:${DASK_UI_PORT}<\/url><\/service> <service><role>DASKWS<\/role><url>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \
+      /etc/knox/conf/topologies/default.xml
+  fi
+
+  mkdir -p "${KNOX_DASK_DIR}"
+  # The heredocs below use a quoted 'EOF' delimiter: the XML is written verbatim, without shell expansion
+  cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<service role="DASK" name="dask" version="0.1.0">
+  <policies>
+    <policy role="webappsec"/>
+    <policy role="authentication" name="Anonymous"/>
+    <policy role="rewrite"/>
+    <policy role="authorization"/>
+  </policies>
+
+  <routes>
+    <!-- Javascript paths -->
+    <route path="/dask/**/*.js">
+      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
+    </route>
+    <route path="/dask/**/*.js?**">
+      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
+    </route>
+
+    <!-- CSS paths -->
+    <route path="/dask/**/*.css">
+      <rewrite apply="DASK/dask/inbound/css/dask" to="request.url"/>
+    </route>
+
+    <!-- General path routing -->
+    <route path="/dask">
+      <rewrite apply="DASK/dask/inbound/root" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
+    </route>
+    <route path="/dask/**">
+      <rewrite apply="DASK/dask/inbound/root/path" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
+      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
+    </route>
+    <route path="/dask/**?**">
+      <rewrite apply="DASK/dask/inbound/root/query" to="request.url"/>
+      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
+      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
+    </route>
+  </routes>
+  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
+</service>
+EOF
+
+  cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<rules>
+  <rule dir="IN" name="DASK/dask/inbound/js/dask" pattern="http://*:*/**/dask/{**}?{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/root" pattern="http://*:*/**/dask">
+    <rewrite template="{$serviceUrl[DASK]}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/root/path" pattern="http://*:*/**/dask/{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/root/query" pattern="http://*:*/**/dask/{**}?{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
+  </rule>
+  <rule dir="IN" name="DASK/dask/inbound/css/dask" pattern="http://*:*/**/dask/{**}?{**}">
+    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
+  </rule>
+  <!-- without the /gateway/default prefix -->
+  <rule dir="IN" name="DASK/dask/inbound/root/noprefix" pattern="http://*:*/dask">
+    <rewrite template="{$serviceUrl[DASK]}"/>
+  </rule>
+
+  <rule dir="OUT" name="DASK/dask/outbound/logs" pattern="/logs">
+    <rewrite template="{$frontend[path]}/dask/info/logs"/>
+  </rule>
+
+  <!-- Rewrite redirect responses Location header -->
+  <filter name="DASK/dask/outbound/headers">
+    <content type="application/x-http-headers">
+      <apply path="Location" rule="DASK/dask/outbound/headers/location"/>
+    </content>
+  </filter>
+
+  <rule dir="OUT" name="DASK/dask/outbound/headers/location" flow="OR">
+    <match pattern="*://*:*/">
+      <rewrite template="{$frontend[path]}/dask/"/>
+    </match>
+    <match pattern="*://*:*/{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}"/>
+    </match>
+    <match pattern="*://*:*/{**}?{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
+    </match>
+    <match pattern="/{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}"/>
+    </match>
+    <match pattern="/{**}?{**}">
+      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
+    </match>
+  </rule>
+</rules>
+EOF
+
+  mkdir -p "${KNOX_DASKWS_DIR}"
+
+  cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<service role="DASKWS" name="daskws" version="0.1.0">
+  <policies>
+    <policy role="webappsec"/>
+    <policy role="authentication" name="Anonymous"/>
+    <policy role="rewrite"/>
+    <policy role="authorization"/>
+  </policies>
+
+  <routes>
+
+    <route path="/dask/**/ws">
+      <rewrite apply="DASKWS/daskws/inbound/ws" to="request.url"/>
+    </route>
+
+  </routes>
+  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
+</service>
+EOF
+
+  cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF'
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+
+<rules>
+  <rule dir="IN" name="DASKWS/daskws/inbound/ws" pattern="ws://*:*/**/dask/{**}/ws">
+    <rewrite template="{$serviceUrl[DASKWS]}/{**}/ws"/>
+  </rule>
+</rules>
+EOF
+
+  chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}"
+
+  # Do not restart knox during pre-init script run (knox is restarted only when ROLE is non-empty)
+  if [[ -n "${ROLE}" ]]; then
+    restart_knox
+  fi
+}
+
+function configure_fluentd_for_dask() {  # write google-fluentd tail sources for the Dask logs, then restart fluentd
+  if [[ "$(hostname -s)" == "${MASTER}" ]]; then  # scheduler section: primary master only; this write truncates the file
+    cat >/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
+# Fluentd config for Dask logs
+
+# Dask scheduler
+<source>
+  @type tail
+  path /var/log/dask-scheduler.log
+  pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos
+  read_from_head true
+  tag google.dataproc.dask-scheduler
+  <parse>
+    @type none
+  </parse>
+</source>
+
+<filter google.dataproc.dask-scheduler>
+  @type record_transformer
+  <record>
+    filename dask-scheduler.log
+  </record>
+</filter>
+EOF
+  fi
+
+  if [[ "${enable_systemd_dask_worker_service}" == "1" ]]; then  # worker section is appended (>>) to the same file
+    cat >>/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
+# Dask worker
+<source>
+  @type tail
+  path /var/log/dask-worker.log
+  pos_file /var/tmp/fluentd.dataproc.dask.worker.pos
+  read_from_head true
+  tag google.dataproc.dask-worker
+  <parse>
+    @type none
+  </parse>
+</source>
+
+<filter google.dataproc.dask-worker>
+  @type record_transformer
+  <record>
+    filename dask-worker.log
+  </record>
+</filter>
+EOF
+  fi
+
+  systemctl restart google-fluentd
+}
+
+function install_dask() {  # build (or restore from cache) the Dask conda environment
+  is_complete install.dask && return
+
+  local python_spec="python>=3.11"  # read by install_conda_packages
+  local dask_version="2024.12.1"
+  local dask_spec="dask>=${dask_version}"
+  local cache_key_name="dask-${dask_version}"
+
+  CONDA_PACKAGES=()
+  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
+    local dask_yarn_version="0.9"
+    cache_key_name="dask-yarn-${dask_yarn_version}"
+    # Pin `distributed` and `dask` package versions to old release
+    # because `dask-yarn` 0.9 uses skein in a way which
+    # is not compatible with `distributed` package 2022.2 and newer:
+    # https://github.com/dask/dask-yarn/issues/155
+
+    dask_spec="dask<2022.2"
+    python_spec="python>=3.7,<3.8.0a0"
+    if is_ubuntu18 ; then
+      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
+      CONDA_PACKAGES+=("fiona<1.8.22")
+    fi
+    CONDA_PACKAGES+=("dask-yarn=${dask_yarn_version}" "distributed<2022.2")  # double quotes so the version expands
+  fi
+
+  CONDA_PACKAGES+=(
+    "${dask_spec}"
+    "dask-bigquery"
+    "dask-ml"
+    "dask-sql"
+  )
+
+  unset CONDA_CHANNEL_ARGS  # install_conda_packages then falls back to its default channel arguments
+  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}"
+  install_conda_packages "${cache_key}"
+
+  mark_complete install.dask
+}
+
+function install_dask_rapids() {  # build (or restore from cache) the Dask + RAPIDS conda environment
+  if ( is_complete install.dask-rapids && test -d "${DASK_CONDA_ENV}" ) ; then return ; fi
+
+  local numba_spec="numba"
+  local dask_version="2024.7"
+  local dask_spec="dask>=${dask_version}"
+
+  local python_spec="python>=3.11"  # read by install_conda_packages
+  local cuda_spec="cuda-version>=12,<13"
+  local cudart_spec="cuda-cudart"
+  if is_cuda11 ; then  # CUDA 11: different python floor, cuda-version range and runtime package
+    python_spec="python>=3.9"
+    cuda_spec="cuda-version>=11,<12.0a0"
+    cudart_spec="cudatoolkit"
+  fi
+
+  local rapids_spec="rapids>=${RAPIDS_VERSION}"
+  CONDA_PACKAGES=()
+  local cache_key_name="dask-rapids-${RAPIDS_VERSION}"
+  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
+    local rapids_version="24.05"  # upper bound applied to rapids for the yarn runtime
+    cache_key_name="dask-rapids-yarn-${rapids_version}"
+    # Pin `distributed` and `dask` package versions to old release
+    # because `dask-yarn` 0.9 uses skein in a way which
+    # is not compatible with `distributed` package 2022.2 and newer:
+    # https://github.com/dask/dask-yarn/issues/155
+
+    dask_spec="dask<2022.2"
+    python_spec="python>=3.9"
+    rapids_spec="rapids<=${rapids_version}"
+    if is_ubuntu18 ; then
+      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
+      CONDA_PACKAGES+=("fiona<1.8.22")
+    fi
+    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
+  fi
+
+  CONDA_PACKAGES+=(
+    "${cuda_spec}"
+    "${cudart_spec}"
+    "${rapids_spec}"
+    "${dask_spec}"
+    "dask-bigquery"
+    "dask-ml"
+    "dask-sql"
+    "cudf"
+    "${numba_spec}"
+  )
+
+  CONDA_CHANNEL_ARGS="-c conda-forge -c nvidia -c rapidsai"  # expanded unquoted by install_conda_packages
+
+  local cache_key="${cache_key_name}_${DATAPROC_IMAGE_VERSION}-${_shortname}"
+  install_conda_packages "${cache_key}"
+
+  mark_complete install.dask-rapids
+}
+
+# The bash array CONDA_PACKAGES must contain a set of package
+# specifications before calling this function
+
+# The bash string CONDA_CHANNEL_ARGS may contain arguments to specify
+# conda channels. Default is "-c 'conda-forge'"
+
+function install_conda_packages() {  # $1: cache key; also reads python_spec, conda_env, install_log, tmpdir and pkg_bucket, none of which are set here
+  local cache_key="${1}"
+
+  local build_tarball="${cache_key}.tar.gz"
+  local gcs_tarball="${pkg_bucket}/conda/${cache_key%%_*}/${build_tarball}"
+  local local_tarball="${tmpdir}/${build_tarball}"
+
+  if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then  # a pre-built environment tarball exists in GCS
+    echo "cache hit"
+    mkdir -p "${DASK_CONDA_ENV}"
+    time ( gcloud storage cat "${gcs_tarball}" | tar -C "${DASK_CONDA_ENV}" -xz )
+    return 0
+  fi
+
+  # Install cuda, rapids, dask
+  mamba="/opt/conda/miniconda3/bin/mamba"
+  conda="/opt/conda/miniconda3/bin/conda"
+
+  ( set +e  # subshell: installer failures are handled below rather than aborting the script
+  local is_installed="0"
+  for installer in "${mamba}" "${conda}" ; do  # try mamba first, then conda
+    test -d "${DASK_CONDA_ENV}" || \
+      time "${installer}" "create" -m -n "${conda_env}" -y --no-channel-priority \
+      ${CONDA_CHANNEL_ARGS:- -c 'conda-forge'}  \
+      "${CONDA_PACKAGES[@]}" \
+      "${python_spec}" \
+      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    sync
+    if [[ "$retval" == "0" ]] ; then  # env exists or was created: archive it to GCS for later runs
+      is_installed="1"
+      pushd "${DASK_CONDA_ENV}"
+      time (
+        tar czf "${local_tarball}" .
+        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+        rm "${local_tarball}"
+      )
+      popd
+      break
+    fi
+    "${conda}" config --set channel_priority flexible
+  done
+
+  if [[ "${is_installed}" == "0" ]]; then
+    echo "failed to install dask"
+    return 1
+  fi
+  )
+}
+
+function prepare_dask_env() {
+  # Dask config; the default is passed as the second argument, as at the other get_metadata_attribute call sites
+  DASK_RUNTIME="$(get_metadata_attribute dask-runtime 'standalone')"
+  readonly DASK_RUNTIME
+  readonly DASK_SERVICE=dask-cluster
+  readonly DASK_WORKER_SERVICE=dask-worker
+  readonly DASK_SCHEDULER_SERVICE=dask-scheduler
+  readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/${conda_env}"
+  # Knox dask config
+  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
+  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
+}
+
+function prepare_dask_rapids_env(){  # Dask environment setup plus RAPIDS defaults
+  prepare_dask_env
+
+  # Default rapids runtime; prepare_gpu_env only sets 'SPARK' when this variable is not already set
+  readonly DEFAULT_RAPIDS_RUNTIME='DASK'
+
+  local DEFAULT_DASK_RAPIDS_VERSION="24.08"
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+    DEFAULT_DASK_RAPIDS_VERSION="23.08" # Final release to support spark 3.1.3
+  fi
+  readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION})  # the rapids-version attribute overrides the default
+}
diff --git a/templates/generate-action.pl b/templates/generate-action.pl
new file mode 100644
index 000000000..7cc954a67
--- /dev/null
+++ b/templates/generate-action.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/perl -w
+# -*-CPerl-*-
+
+# Usage: Run this script from the root directory of the git clone:
+# perl templates/generate-action.pl gpu/install_gpu_driver.sh
+
+use Template;
+use strict;
+
+my $action = $ARGV[0];
+my $v = { template_path => "${action}.in" };  # template variables; the template is templates/<action>.in
+
+sub usage{ die "Usage: $0 <action>" }
+
+usage unless( $action && -f "$ENV{PWD}/templates/$v->{template_path}" );  # need an argument whose template file exists
+
+my $tt = Template->new( {
+  INCLUDE_PATH => "$ENV{PWD}/templates",
+  VARIABLES => $v,
+  INTERPOLATE  => 0,  # do not interpolate $var references found in plain template text
+}) || die "$Template::ERROR$/";
+
+# Render the template; no output argument is given to process()
+$tt->process($v->{template_path}) or die( $tt->error(), "\n" );
diff --git a/templates/gpu/util_functions b/templates/gpu/util_functions
new file mode 100644
index 000000000..48473d13b
--- /dev/null
+++ b/templates/gpu/util_functions
@@ -0,0 +1,220 @@
+function set_support_matrix() {
+  # CUDA version and Driver version
+  # https://docs.nvidia.com/deploy/cuda-compatibility/
+  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+  # https://developer.nvidia.com/cuda-downloads
+
+  # Minimum supported version for open kernel driver is 515.43.04
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
+  # Rocky8: 12.0: 525.147.05
+  # The newest published driver version is listed at
+  # https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt
+  readonly -A DRIVER_FOR_CUDA=(
+          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
+          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+  )
+  readonly -A DRIVER_SUBVER=(
+          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+  )
+  # https://developer.nvidia.com/cudnn-downloads
+  if is_debuntu ; then
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+  )
+  elif is_rocky ; then
+  # rocky:
+  #   12.0: 8.8.1.3
+  #   12.1: 8.9.3.28
+  #   12.2: 8.9.7.29
+  #   12.3: 9.0.0.312
+  #   12.4: 9.1.1.17
+  #   12.5: 9.2.1.18
+  #   12.6: 9.5.1.17
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
+  )
+  fi
+  # https://developer.nvidia.com/nccl/nccl-download
+  # 12.2: 2.19.3, 12.5: 2.21.5
+  readonly -A NCCL_FOR_CUDA=(
+          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
+          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+  )
+  readonly -A CUDA_SUBVER=(
+          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
+          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
+  )
+}
+
+function set_cuda_version() {  # sets readonly CUDA_VERSION (major.minor) and CUDA_FULL_VERSION
+  case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # CUDA 12.1.1 (driver v530.30.02) is the latest version supported by Ubuntu 18
+    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    *   )
+      echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
+      exit 1
+      ;;
+  esac
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+  if [[ -n "${cuda_url}" ]] ; then
+    # if cuda-url metadata variable has been passed, extract default version from url
+    local CUDA_URL_VERSION
+    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then  # false when the URL did not match (perl leaves it unchanged)
+      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"  # NOTE(review): drops the patch component, so CUDA_FULL_VERSION then comes from CUDA_SUBVER, not the URL - confirm intended
+    fi
+  fi
+  readonly DEFAULT_CUDA_VERSION
+
+  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then  # a full x.y.z version was given
+    CUDA_FULL_VERSION="${CUDA_VERSION}"
+    CUDA_VERSION="${CUDA_VERSION%.*}"
+  fi
+  readonly CUDA_VERSION
+  if ( ! test -v CUDA_FULL_VERSION ) ; then  # only major.minor is known: look the full version up in CUDA_SUBVER
+    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
+  fi
+  readonly CUDA_FULL_VERSION
+}
+
+function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )  # subshell body: 'set +x' does not affect the caller
+function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )  # compares the major version only
+function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )  # compares the major version only
+
+function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )  # subshell body: 'set +x' does not affect the caller
+function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )  # compares the major version only
+function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )  # compares the major version only
+
+function set_driver_version() {  # sets and exports readonly DRIVER_VERSION and DRIVER (its major component)
+  local gpu_driver_url
+  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
+
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+
+  local DEFAULT_DRIVER
+  # Take default from gpu-driver-url metadata value; two-component versions such as 550.135 are accepted
+  if [[ -n "${gpu_driver_url}" ]] ; then
+    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+(?:\.\d+)+)\.run$}{$1}')"
+    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
+  # Take default from cuda-url metadata value as a backup
+  elif [[ -n "${cuda_url}" ]] ; then
+    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
+      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
+      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]:-}  # empty for an unlisted major instead of tripping 'set -u'
+      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the version indicated by the cuda url as the default if it exists
+        DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the maximum sub-version available for the major version indicated in cuda url as the default
+        DEFAULT_DRIVER="${driver_max_maj_version}"
+      fi
+    fi
+  fi
+
+  if ( ! test -v DEFAULT_DRIVER ) ; then
+    # If a default driver version has not been extracted, use the default for this version of CUDA
+    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
+  fi
+
+  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
+
+  readonly DRIVER_VERSION
+  readonly DRIVER="${DRIVER_VERSION%%.*}"
+
+  export DRIVER_VERSION DRIVER
+  # The availability check below always targets download.nvidia.com, whatever gpu-driver-url was
+  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
+    exit 1
+  fi
+}
+
+function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )  # true when GPU_DRIVER_PROVIDER is exactly "NVIDIA"
+function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )  # true when GPU_DRIVER_PROVIDER is exactly "OS"
+
+function nvsmi() {  # nvidia-smi wrapper: returns 0 with a message on stderr when the tool is missing or failing
+  local nvsmi="/usr/bin/nvidia-smi"
+  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
+  elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
+  elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
+  else nvsmi_works="1" ; fi
+
+  if test -v 1 && [[ "$1" == "-L" ]] ; then  # "-L" output is cached in a file and replayed on later calls
+    local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
+    if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
+    else "${nvsmi}" "$@" | tee "${NV_SMI_L_CACHE_FILE}" ; fi
+
+    return 0
+  fi
+
+  "${nvsmi}" "$@"  # "$@" forwards each argument unchanged, without re-splitting or globbing
+}
+
+function clear_nvsmi_cache() {  # removes the XML query cache only; the "-L" cache file written by nvsmi is left alone
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
+    rm "${nvsmi_query_xml}"
+  fi
+}
+
+function query_nvsmi() {  # writes the output of `nvsmi -q -x --dtd` to ${nvsmi_query_xml} once
+  if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi  # nvidia-smi has not been verified as working
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi  # cache file already present
+  nvsmi -q -x --dtd > "${nvsmi_query_xml}"
+}
+
+function prepare_gpu_env(){  # resolve CUDA/driver versions and initialise the GPU-related globals
+  set_support_matrix
+
+  set_cuda_version
+  set_driver_version
+
+  set +e  # grep exits non-zero when nothing matches; do not abort on that
+  gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
+  set -e
+  echo "gpu_count=[${gpu_count}]"
+  nvsmi_works="0"
+  nvsmi_query_xml="${tmpdir}/nvsmi.xml"
+  xmllint="/opt/conda/miniconda3/bin/xmllint"
+  NVIDIA_SMI_PATH='/usr/bin'
+  MIG_MAJOR_CAPS=0
+  IS_MIG_ENABLED=0
+  CUDNN_PKG_NAME=""
+  CUDNN8_PKG_NAME=""
+  CUDA_LOCAL_REPO_INSTALLED="0"
+
+  if ! test -v DEFAULT_RAPIDS_RUNTIME ; then  # keep a default that is already set (prepare_dask_rapids_env sets 'DASK')
+    readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
+  fi
+
+  # Select the RAPIDS runtime from the rapids-runtime attribute, defaulting to DEFAULT_RAPIDS_RUNTIME
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  readonly RAPIDS_RUNTIME
+
+  # determine whether we have nvidia-smi installed and working (nvsmi sets nvsmi_works=1 on success)
+  nvsmi
+}
+
+# Hold all NVIDIA-related packages from upgrading unintentionally or through services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  if ! is_debuntu ; then return ; fi
+  # Patterns are quoted so apt-mark receives the glob itself, not working-directory files that match it
+  apt-mark hold 'nvidia-*'
+  apt-mark hold 'libnvidia-*'
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold 'xserver-xorg-video-nvidia*'
+  fi
+}
+
+function gpu_exit_handler() {  # currently only logs a message
+  echo "no operations in gpu exit handler"
+}
diff --git a/templates/legal/license_header b/templates/legal/license_header
new file mode 100644
index 000000000..0230ca951
--- /dev/null
+++ b/templates/legal/license_header
@@ -0,0 +1,13 @@
+# Copyright 2015 Google LLC and contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/templates/rapids/rapids.sh.in b/templates/rapids/rapids.sh.in
index 3fd48089e..61b7247c0 100644
--- a/templates/rapids/rapids.sh.in
+++ b/templates/rapids/rapids.sh.in
@@ -1,493 +1,19 @@
 #!/bin/bash
-
-# Copyright 2019,2020,2021,2022,2024 Google LLC
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+[% INSERT legal/license_header %]
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+[% PROCESS common/template_disclaimer %]
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 # This initialization action script will install rapids on a Dataproc
 # cluster.
 
 set -euxo pipefail
 
-function os_id()       { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; }
-function is_ubuntu()   { [[ "$(os_id)" == 'ubuntu' ]] ; }
-function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; }
-function is_debian()   { [[ "$(os_id)" == 'debian' ]] ; }
-function is_debuntu()  { is_debian || is_ubuntu ; }
-
-function print_metadata_value() {
-  local readonly tmpfile=$(mktemp)
-  http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
-    -s -o ${tmpfile} 2>/dev/null)
-  local readonly return_code=$?
-  # If the command completed successfully, print the metadata value to stdout.
-  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
-    cat ${tmpfile}
-  fi
-  rm -f ${tmpfile}
-  return ${return_code}
-}
-
-function print_metadata_value_if_exists() {
-  local return_code=1
-  local readonly url=$1
-  print_metadata_value ${url}
-  return_code=$?
-  return ${return_code}
-}
-
-function get_metadata_value() {
-  set +x
-  local readonly varname=$1
-  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
-  # Print the instance metadata value.
-  print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname}
-  return_code=$?
-  # If the instance doesn't have the value, try the project.
-  if [[ ${return_code} != 0 ]]; then
-    print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname}
-    return_code=$?
-  fi
-  set -x
-  return ${return_code}
-}
-
-function get_metadata_attribute() (
-  set +x
-  local -r attribute_name="$1"
-  local -r default_value="${2:-}"
-  get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
-)
-
-function execute_with_retries() {
-  local -r cmd="$*"
-  for i in {0..9} ; do
-    if eval "$cmd"; then
-      return 0 ; fi
-    sleep 5
-  done
-  echo "Cmd '${cmd}' failed."
-  return 1
-}
-
-function restart_knox() {
-  systemctl stop knox
-  rm -rf "${KNOX_HOME}/data/deployments/*"
-  systemctl start knox
-}
-
-function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; }
-function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; }
-
-function configure_dask_yarn() {
-  readonly DASK_YARN_CONFIG_DIR=/etc/dask/
-  readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml
-  # Minimal custom configuration is required for this
-  # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage
-  # for information on tuning Dask-Yarn environments.
-  mkdir -p "${DASK_YARN_CONFIG_DIR}"
-
-  cat <<EOF >"${DASK_YARN_CONFIG_FILE}"
-# Config file for Dask Yarn.
-#
-# These values are joined on top of the default config, found at
-# https://yarn.dask.org/en/latest/configuration.html#default-configuration
-
-yarn:
-  environment: python://${DASK_CONDA_ENV}/bin/python
-
-  worker:
-    count: 2
-    gpus: 1
-    class: "dask_cuda.CUDAWorker"
-EOF
-}
-
-function install_systemd_dask_worker() {
-  echo "Installing systemd Dask Worker service..."
-  local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}"
-
-  mkdir -p "${dask_worker_local_dir}"
-
-  local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh"
-
-  cat <<EOF >"${DASK_WORKER_LAUNCHER}"
-#!/bin/bash
-LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log"
-nvidia-smi -c DEFAULT
-echo "dask-cuda-worker starting, logging to \${LOGFILE}"
-${DASK_CONDA_ENV}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1
-EOF
-
-  chmod 750 "${DASK_WORKER_LAUNCHER}"
-
-  local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service"
-  cat <<EOF >"${dask_service_file}"
-[Unit]
-Description=Dask Worker Service
-[Service]
-Type=simple
-Restart=on-failure
-ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}'
-[Install]
-WantedBy=multi-user.target
-EOF
-  chmod a+r "${dask_service_file}"
-
-  systemctl daemon-reload
-
-  # Enable the service
-  if [[ "${ROLE}" != "Master" ]]; then
-    enable_worker_service="1"
-  else
-     local RUN_WORKER_ON_MASTER=$(get_metadata_attribute dask-cuda-worker-on-master 'true')
-    # Enable service on single-node cluster (no workers)
-    local worker_count="$(get_metadata_attribute dataproc-worker-count)"
-    if [[ "${worker_count}" == "0" || "${RUN_WORKER_ON_MASTER}" == "true" ]]; then
-      enable_worker_service="1"
-    fi
-  fi
-
-  if [[ "${enable_worker_service}" == "1" ]]; then
-    systemctl enable "${DASK_WORKER_SERVICE}"
-    systemctl restart "${DASK_WORKER_SERVICE}"
-  fi
-}
-
-function install_systemd_dask_scheduler() {
-  # only run scheduler on primary master
-  if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi
-  echo "Installing systemd Dask Scheduler service..."
-  local -r dask_scheduler_local_dir="/tmp/${DASK_SCHEDULER_SERVICE}"
-
-  mkdir -p "${dask_scheduler_local_dir}"
-
-  local DASK_SCHEDULER_LAUNCHER="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh"
-
-  cat <<EOF >"${DASK_SCHEDULER_LAUNCHER}"
-#!/bin/bash
-LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log"
-echo "dask scheduler starting, logging to \${LOGFILE}"
-${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1
-EOF
-
-  chmod 750 "${DASK_SCHEDULER_LAUNCHER}"
-
-  local -r dask_service_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service"
-  cat <<EOF >"${dask_service_file}"
-[Unit]
-Description=Dask Scheduler Service
-[Service]
-Type=simple
-Restart=on-failure
-ExecStart=/bin/bash -c 'exec ${DASK_SCHEDULER_LAUNCHER}'
-[Install]
-WantedBy=multi-user.target
-EOF
-  chmod a+r "${dask_service_file}"
-
-  systemctl daemon-reload
-
-  # Enable the service
-  systemctl enable "${DASK_SCHEDULER_SERVICE}"
-}
-
-function install_systemd_dask_service() {
-  install_systemd_dask_scheduler
-  install_systemd_dask_worker
-}
-
-function configure_knox_for_dask() {
-  if [[ ! -d "${KNOX_HOME}" ]]; then
-    echo "Skip configuring Knox rules for Dask"
-    return 0
-  fi
-
-  local DASK_UI_PORT=8787
-  if [[ -f /etc/knox/conf/topologies/default.xml ]]; then
-    sed -i \
-      "/<\/topology>/i <service><role>DASK<\/role><url>http://localhost:${DASK_UI_PORT}<\/url><\/service> <service><role>DASKWS<\/role><url>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \
-      /etc/knox/conf/topologies/default.xml
-  fi
-
-  mkdir -p "${KNOX_DASK_DIR}"
-
-  cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF'
-<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-
-<service role="DASK" name="dask" version="0.1.0">
-  <policies>
-    <policy role="webappsec"/>
-    <policy role="authentication" name="Anonymous"/>
-    <policy role="rewrite"/>
-    <policy role="authorization"/>
-  </policies>
-
-  <routes>
-    <!-- Javascript paths -->
-    <route path="/dask/**/*.js">
-      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
-      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
-    </route>
-    <route path="/dask/**/*.js?**">
-      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
-      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
-    </route>
-
-    <!-- CSS paths -->
-    <route path="/dask/**/*.css">
-      <rewrite apply="DASK/dask/inbound/css/dask" to="request.url"/>
-    </route>
-
-    <!-- General path routing -->
-    <route path="/dask">
-      <rewrite apply="DASK/dask/inbound/root" to="request.url"/>
-      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
-    </route>
-    <route path="/dask/**">
-      <rewrite apply="DASK/dask/inbound/root/path" to="request.url"/>
-      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
-      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
-    </route>
-    <route path="/dask/**?**">
-      <rewrite apply="DASK/dask/inbound/root/query" to="request.url"/>
-      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
-      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
-    </route>
-  </routes>
-  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
-</service>
-EOF
-
-  cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF'
-<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-
-<rules>
-  <rule dir="IN" name="DASK/dask/inbound/js/dask" pattern="http://*:*/**/dask/{**}?{**}">
-    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
-  </rule>
-  <rule dir="IN" name="DASK/dask/inbound/root" pattern="http://*:*/**/dask">
-    <rewrite template="{$serviceUrl[DASK]}"/>
-  </rule>
-  <rule dir="IN" name="DASK/dask/inbound/root/path" pattern="http://*:*/**/dask/{**}">
-    <rewrite template="{$serviceUrl[DASK]}/{**}"/>
-  </rule>
-  <rule dir="IN" name="DASK/dask/inbound/root/query" pattern="http://*:*/**/dask/{**}?{**}">
-    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
-  </rule>
-  <rule dir="IN" name="DASK/dask/inbound/css/dask" pattern="http://*:*/**/dask/{**}?{**}">
-    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
-  </rule>
-  <!-- without the /gateway/default prefix -->
-  <rule dir="IN" name="DASK/dask/inbound/root/noprefix" pattern="http://*:*/dask">
-    <rewrite template="{$serviceUrl[DASK]}"/>
-  </rule>
-
-  <rule dir="OUT" name="DASK/dask/outbound/logs" pattern="/logs">
-    <rewrite template="{$frontend[path]}/dask/info/logs"/>
-  </rule>
-
-  <!-- Rewrite redirect responses Location header -->
-  <filter name="DASK/dask/outbound/headers">
-    <content type="application/x-http-headers">
-      <apply path="Location" rule="DASK/dask/outbound/headers/location"/>
-    </content>
-  </filter>
-
-  <rule dir="OUT" name="DASK/dask/outbound/headers/location" flow="OR">
-    <match pattern="*://*:*/">
-      <rewrite template="{$frontend[path]}/dask/"/>
-    </match>
-    <match pattern="*://*:*/{**}">
-      <rewrite template="{$frontend[path]}/dask/{**}"/>
-    </match>
-    <match pattern="*://*:*/{**}?{**}">
-      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
-    </match>
-    <match pattern="/{**}">
-      <rewrite template="{$frontend[path]}/dask/{**}"/>
-    </match>
-    <match pattern="/{**}?{**}">
-      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
-    </match>
-  </rule>
-</rules>
-EOF
-
-  mkdir -p "${KNOX_DASKWS_DIR}"
-
-  cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF'
-<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-
-<service role="DASKWS" name="daskws" version="0.1.0">
-  <policies>
-    <policy role="webappsec"/>
-    <policy role="authentication" name="Anonymous"/>
-    <policy role="rewrite"/>
-    <policy role="authorization"/>
-  </policies>
-
-  <routes>
+[% INSERT common/util_functions %]
 
-    <route path="/dask/**/ws">
-      <rewrite apply="DASKWS/daskws/inbound/ws" to="request.url"/>
-    </route>
+[% INSERT gpu/util_functions %]
 
-  </routes>
-  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
-</service>
-EOF
-
-  cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF'
-<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-
-<rules>
-  <rule dir="IN" name="DASKWS/daskws/inbound/ws" pattern="ws://*:*/**/dask/{**}/ws">
-    <rewrite template="{$serviceUrl[DASKWS]}/{**}/ws"/>
-  </rule>
-</rules>
-EOF
-
-  chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}"
-
-  # Do not restart knox during pre-init script run
-  if [[ -n "${ROLE}" ]]; then
-    restart_knox
-  fi
-}
-
-function configure_fluentd_for_dask() {
-  if [[ "$(hostname -s)" == "${MASTER}" ]]; then
-    cat >/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
-# Fluentd config for Dask logs
-
-# Dask scheduler
-<source>
-  @type tail
-  path /var/log/dask-scheduler.log
-  pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos
-  read_from_head true
-  tag google.dataproc.dask-scheduler
-  <parse>
-    @type none
-  </parse>
-</source>
-
-<filter google.dataproc.dask-scheduler>
-  @type record_transformer
-  <record>
-    filename dask-scheduler.log
-  </record>
-</filter>
-EOF
-  fi
-
-  if [[ "${enable_worker_service}" == "1" ]]; then
-    cat >>/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
-# Dask worker
-<source>
-  @type tail
-  path /var/log/dask-worker.log
-  pos_file /var/tmp/fluentd.dataproc.dask.worker.pos
-  read_from_head true
-  tag google.dataproc.dask-worker
-  <parse>
-    @type none
-  </parse>
-</source>
-
-<filter google.dataproc.dask-worker>
-  @type record_transformer
-  <record>
-    filename dask-worker.log
-  </record>
-</filter>
-EOF
-  fi
-
-  systemctl restart google-fluentd
-}
-
-function install_dask_rapids() {
-  if is_cuda12 ; then
-    local python_spec="python>=3.11"
-    local cuda_spec="cuda-version>=12,<13"
-    local dask_spec="dask>=2024.7"
-    local numba_spec="numba"
-  elif is_cuda11 ; then
-    local python_spec="python>=3.9"
-    local cuda_spec="cuda-version>=11,<12.0a0"
-    local dask_spec="dask"
-    local numba_spec="numba"
-  fi
-
-  rapids_spec="rapids>=${RAPIDS_VERSION}"
-  CONDA_PACKAGES=()
-  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
-    # Pin `distributed` and `dask` package versions to old release
-    # because `dask-yarn` 0.9 uses skein in a way which
-    # is not compatible with `distributed` package 2022.2 and newer:
-    # https://github.com/dask/dask-yarn/issues/155
-
-    dask_spec="dask<2022.2"
-    python_spec="python>=3.7,<3.8.0a0"
-    rapids_spec="rapids<=24.05"
-    if is_ubuntu18 ; then
-      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
-      CONDA_PACKAGES+=("fiona<1.8.22")
-    fi
-    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
-  fi
-
-  CONDA_PACKAGES+=(
-    "${cuda_spec}"
-    "${rapids_spec}"
-    "${dask_spec}"
-    "dask-bigquery"
-    "dask-ml"
-    "dask-sql"
-    "cudf"
-    "${numba_spec}"
-  )
-
-  # Install cuda, rapids, dask
-  mamba="/opt/conda/miniconda3/bin/mamba"
-  conda="/opt/conda/miniconda3/bin/conda"
-
-  "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]"
-
-  ( set +e
-  local is_installed="0"
-  for installer in "${mamba}" "${conda}" ; do
-    test -d "${DASK_CONDA_ENV}" || \
-      time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \
-      -c 'conda-forge' -c 'nvidia' -c 'rapidsai'  \
-      ${CONDA_PACKAGES[*]} \
-      "${python_spec}" \
-      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
-    sync
-    if [[ "$retval" == "0" ]] ; then
-      is_installed="1"
-      break
-    fi
-    "${conda}" config --set channel_priority flexible
-  done
-  if [[ "${is_installed}" == "0" ]]; then
-    echo "failed to install dask"
-    return 1
-  fi
-  )
-}
+[% INSERT dask/util_functions %]
 
 function main() {
   # Install Dask with RAPIDS
@@ -496,166 +22,40 @@ function main() {
   # In "standalone" mode, Dask relies on a systemd unit to launch.
   # In "yarn" mode, it relies a config.yaml file.
   if [[ "${DASK_RUNTIME}" == "yarn" ]]; then
-    # Create Dask YARN config file
+    # Create cuda accelerated Dask YARN config file
     configure_dask_yarn
+    echo "yarn setup complete"
   else
     # Create Dask service
     install_systemd_dask_service
-
-    if [[ "$(hostname -s)" == "${MASTER}" ]]; then
-      systemctl start "${DASK_SCHEDULER_SERVICE}"
-      systemctl status "${DASK_SCHEDULER_SERVICE}"
-    fi
-
-    echo "Starting Dask 'standalone' cluster..."
-    if [[ "${enable_worker_service}" == "1" ]]; then
-      systemctl start "${DASK_WORKER_SERVICE}"
-      systemctl status "${DASK_WORKER_SERVICE}"
-    fi
+    start_systemd_dask_service
 
     configure_knox_for_dask
 
-    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
+    local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging 'false')"
     if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
       configure_fluentd_for_dask
     fi
   fi
-
-  echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized."
-  if [[ "${ROLE}" == "Master" ]]; then
-    systemctl restart hadoop-yarn-resourcemanager.service
-    # Restart NodeManager on Master as well if this is a single-node-cluster.
-    if systemctl list-units | grep hadoop-yarn-nodemanager; then
-      systemctl restart hadoop-yarn-nodemanager.service
-    fi
-  else
-    systemctl restart hadoop-yarn-nodemanager.service
-  fi
 }
 
-function exit_handler() (
-  set +e
-  echo "Exit handler invoked"
-
-  # Free conda cache
-  /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1
-
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
-
-  # remove the tmpfs conda pkgs_dirs
-  if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi
-
-  # Clean up shared memory mounts
-  for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do
-    if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then
-      rm -rf ${shmdir}/*
-      umount -f ${shmdir}
-    fi
-  done
-
-  # Clean up OS package cache ; re-hold systemd package
-  if is_debuntu ; then
-    apt-get -y -qq clean
-    apt-get -y -qq autoremove
-  else
-    dnf clean all
-  fi
-
-  # print disk usage statistics
-  if is_debuntu ; then
-    # Rocky doesn't have sort -h and fails when the argument is passed
-    du --max-depth 3 -hx / | sort -h | tail -10
-  fi
-
-  # Process disk usage logs from installation period
-  rm -f "${tmpdir}/keep-running-df"
-  sleep 6s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
-#Filesystem      Size  Used Avail Use% Mounted on
-#/dev/vda2       6.8G  2.5G  4.0G  39% /
-  df -h / | tee -a "${tmpdir}/disk-usage.log"
-  perl -e '$max=( sort
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> )[-1];
-print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log"
-
-  echo "exit_handler has completed"
-
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero ; sync ; rm -f /zero
-  fi
-
+function exit_handler() {
+  gpu_exit_handler
+  pip_exit_handler
+  conda_exit_handler
+  common_exit_handler
   return 0
-)
+}
 
 function prepare_to_install(){
-  readonly DEFAULT_CUDA_VERSION="12.4"
-  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION})
-  readonly CUDA_VERSION
-
-  readonly ROLE=$(get_metadata_attribute dataproc-role)
-  readonly MASTER=$(get_metadata_attribute dataproc-master)
-
-  # RAPIDS config
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
-  readonly RAPIDS_RUNTIME
-
-  readonly DEFAULT_DASK_RAPIDS_VERSION="24.08"
-  readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION})
-
-  # Dask config
-  DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')"
-  readonly DASK_RUNTIME
-  readonly DASK_SERVICE=dask-cluster
-  readonly DASK_WORKER_SERVICE=dask-worker
-  readonly DASK_SCHEDULER_SERVICE=dask-scheduler
-  readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask-rapids"
-
-  # Knox config
-  readonly KNOX_HOME=/usr/lib/knox
-  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
-  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
-  enable_worker_service="0"
-
-  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  # Write to a ramdisk instead of churning the persistent disk
-  if [[ ${free_mem} -ge 5250000 ]]; then
-    tmpdir=/mnt/shm
-    mkdir -p /mnt/shm
-    mount -t tmpfs tmpfs /mnt/shm
-
-    # Download conda packages to tmpfs
-    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm
-    mount -t tmpfs tmpfs /mnt/shm
-
-    # Download pip packages to tmpfs
-    pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir"
-
-    # Download OS packages to tmpfs
-    if is_debuntu ; then
-      mount -t tmpfs tmpfs /var/cache/apt/archives
-    else
-      mount -t tmpfs tmpfs /var/cache/dnf
-    fi
-  else
-    tmpdir=/tmp
-  fi
-  install_log="${tmpdir}/install.log"
+  prepare_common_env
+  conda_env="$(get_metadata_attribute conda-env 'dask-rapids')"
+  readonly conda_env
+  prepare_dask_rapids_env
+  prepare_conda_env
+  prepare_pip_env
+  prepare_gpu_env
   trap exit_handler EXIT
-
-  # Monitor disk usage in a screen session
-  if is_debuntu ; then
-      apt-get install -y -qq screen
-  else
-      dnf -y -q install screen
-  fi
-  df -h / | tee "${tmpdir}/disk-usage.log"
-  touch "${tmpdir}/keep-running-df"
-  screen -d -m -US keep-running-df \
-    bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done"
 }
 
 prepare_to_install

From 1a42f2ace3283f658f034b065a076375001d75ae Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 00:04:26 -0800
Subject: [PATCH 04/15] Template::Toolkit dependency

---
 cloudbuild/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index aebaffd84..644219305 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -22,7 +22,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
     dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
 RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} \
+                           libtemplate-perl > /dev/null 2>&1 && \
     apt-get clean
 
 # Set bazel-${bazel_version} as the default bazel alternative in this container

From b923a13d65ab2d0a3cb0e194ce253ce56b9ffb36 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 14:57:03 -0800
Subject: [PATCH 05/15] mark generated actions as changed when templates are
 changed

---
 cloudbuild/presubmit.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index fc664f1bf..d7f2edb04 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -48,14 +48,14 @@ initialize_git_repo() {
 # to determine all changed files and looks for tests in directories with changed files.
 determine_tests_to_run() {
   # Infer the files that changed
-  mapfile -t CHANGED_ACTION_TEMPLATES < <(git diff origin/master --name-only | grep 'templates/.*/.*\.sh\.in')
-  for tt in "${CHANGED_ACTION_TEMPLATES[@]}"; do
+  mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD)
+  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only | grep -v template)
+  for tt in $(git diff origin/master --name-only | grep 'templates/.*/.*\.sh\.in'); do
     local genfile=`perl -e "print( q{${tt}} =~ m:templates/(.*?.sh).in: )"`
     perl templates/generate-action.pl "${genfile}" > "${genfile}"
+    CHANGED_FILES+=("${genfile}")
   done
 
-  mapfile -t DELETED_BUILD_FILES < <(git diff origin/master --name-only --diff-filter=D | grep BUILD)
-  mapfile -t CHANGED_FILES < <(git diff origin/master --name-only | grep -v template)
   echo "Deleted BUILD files: ${DELETED_BUILD_FILES[*]}"
   echo "Changed files: ${CHANGED_FILES[*]}"
 

From c20684fc4c35f4061899a152829b9838df6cec82 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 17:22:29 -0800
Subject: [PATCH 06/15] retry tar creation and verify before caching to gcs

---
 templates/dask/util_functions | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index d67da1fc1..019bd6778 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -510,9 +510,9 @@ function install_conda_packages() {
     if [[ "$retval" == "0" ]] ; then
       is_installed="1"
       pushd "${DASK_CONDA_ENV}"
-      time (
-        tar czf "${local_tarball}" .
-        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      time ( set -e
+        execute_with_retries "tar czf ${local_tarball} . && tar tzf ${local_tarball}"
+        execute_with_retries gcloud storage cp "${local_tarball}" "${gcs_tarball}"
         rm "${local_tarball}"
       )
       popd

From 2102ef38ff77212272ce4d7eff95357af91408bb Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 18:26:24 -0800
Subject: [PATCH 07/15] root cause was ramdisk exhaustion.  increase minimum
 memory requirements for ramdisk

---
 templates/common/util_functions | 2 +-
 templates/dask/util_functions   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/templates/common/util_functions b/templates/common/util_functions
index 0f0bfeaa6..29282ca31 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -297,7 +297,7 @@ function is_ramdisk() {
 function mount_ramdisk(){
   local free_mem
   free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+  if [[ ${free_mem} -lt 30000000 ]]; then return 0 ; fi
 
   # Write to a ramdisk instead of churning the persistent disk
 
diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index 019bd6778..ce6964e94 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -511,8 +511,8 @@ function install_conda_packages() {
       is_installed="1"
       pushd "${DASK_CONDA_ENV}"
       time ( set -e
-        execute_with_retries "tar czf ${local_tarball} . && tar tzf ${local_tarball}"
-        execute_with_retries gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+        tar czf "${local_tarball}" . && tar tzf "${local_tarball}"
+        gcloud storage cp "${local_tarball}" "${gcs_tarball}"
         rm "${local_tarball}"
       )
       popd

From 19124c0ce80aa6117ec12fc743a3d7b0188947d1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 19:35:43 -0800
Subject: [PATCH 08/15] using larger machine type to make use of ramdisk ;
 relaxing free_mem requirement a bit

---
 rapids/test_rapids.py           | 3 +--
 templates/common/util_functions | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py
index 63fa72a7f..02838ff08 100644
--- a/rapids/test_rapids.py
+++ b/rapids/test_rapids.py
@@ -43,7 +43,6 @@ def run_dask_script(self, name):
     self.assert_instance_command(name, verify_cmd)
     self.remove_test_script(self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME, name)
 
-                            
   @parameterized.parameters(
 # If a new version of dask-yarn is released, add this test back in.
 #    ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"),
@@ -61,7 +60,7 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator,
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-standard-8",
+        machine_type="n1-highmem-8",
         master_accelerator=accelerator,
         worker_accelerator=accelerator,
         boot_disk_size="50GB",
diff --git a/templates/common/util_functions b/templates/common/util_functions
index 29282ca31..336af37f8 100644
--- a/templates/common/util_functions
+++ b/templates/common/util_functions
@@ -297,7 +297,7 @@ function is_ramdisk() {
 function mount_ramdisk(){
   local free_mem
   free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 30000000 ]]; then return 0 ; fi
+  if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi
 
   # Write to a ramdisk instead of churning the persistent disk
 

From 798423dcd7530a2c47e1bea49cd94584f1931cc2 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 20:22:12 -0800
Subject: [PATCH 09/15] increasing the max-idle time

---
 integration_tests/dataproc_test_case.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 683109125..ce7656c29 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -182,7 +182,7 @@ def createCluster(self,
         if not FLAGS.skip_cleanup:
           args.append("--max-age=60m")
 
-        args.append("--max-idle=25m")
+        args.append("--max-idle=45m")
 
         cmd = "{} dataproc clusters create {} {}".format(
             "gcloud beta" if beta else "gcloud", self.name, " ".join(args))

From c30880a819ab593d2fdc13da963fbcdb7422f6a1 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 21:12:23 -0800
Subject: [PATCH 10/15] use a more recent gpu installer

---
 gpu/install_gpu_driver.sh | 2930 +++++++++++++++++++++++--------------
 1 file changed, 1841 insertions(+), 1089 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 25efb2a49..20beac086 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 #
+# Copyright 2015 Google LLC and contributors
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,6 +13,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+#
+# This initialization action is generated from
+# initialization-actions/templates/gpu/install_gpu_driver.sh.in
+#
+# Modifications made directly to the generated file will be lost when
+# the template is re-evaluated
+
 #
 # This script installs NVIDIA GPU drivers and collects GPU utilization metrics.
 
@@ -20,32 +30,38 @@ function os_id()       ( set +x ;  grep '^ID=' /etc/os-release | cut -d= -f2 | x
 function os_version()  ( set +x ;  grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
 function os_codename() ( set +x ;  grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )
 
+# Version (or real number) comparison helpers.
+# version_ge / version_gt / version_le / version_lt succeed when the first argument is >=, >, <=, or < the second, respectively
+# ( version_ge 2.0 2.1 ) evaluates to false
+# ( version_ge 2.2 2.1 ) evaluates to true
 function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
 function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
 function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
 function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )
 
-readonly -A supported_os=(
-  ['debian']="10 11 12"
-  ['rocky']="8 9"
-  ['ubuntu']="18.04 20.04 22.04"
-)
-
-# dynamically define OS version test utility functions
-if [[ "$(os_id)" == "rocky" ]];
-then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
-else _os_version="$(os_version)"; fi
-for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
-  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
-
-  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
-    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
-    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
-    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+function define_os_comparison_functions() {
+
+  readonly -A supported_os=(
+    ['debian']="10 11 12"
+    ['rocky']="8 9"
+    ['ubuntu']="18.04 20.04 22.04"
+  )
+
+  # dynamically define OS version test utility functions
+  if [[ "$(os_id)" == "rocky" ]];
+  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
+  else _os_version="$(os_version)"; fi
+  for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
+    eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"
+
+    for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
+      eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
+      eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
+      eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
+    done
   done
-done
-
-function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )
+  eval "function is_debuntu()  ( set +x ;  is_debian || is_ubuntu ; )"
+}
 
 function os_vercat()   ( set +x
   if   is_ubuntu ; then os_version | sed -e 's/[^0-9]//g'
@@ -53,7 +69,7 @@ function os_vercat()   ( set +x
                    else os_version ; fi ; )
 
 function repair_old_backports {
-  if ge_debian12 || ! is_debuntu ; then return ; fi
+  if ! is_debuntu ; then return ; fi
   # This script uses 'apt-get update' and is therefore potentially dependent on
   # backports repositories which have been archived.  In order to mitigate this
   # problem, we will use archive.debian.org for the oldoldstable repo
@@ -94,6 +110,7 @@ function print_metadata_value_if_exists() {
   return ${return_code}
 }
 
+# replicates /usr/share/google/get_metadata_value
 function get_metadata_value() (
   set +x
   local readonly varname=$1
@@ -117,226 +134,13 @@ function get_metadata_attribute() (
   get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}"
 )
 
-OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
-distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
-readonly OS_NAME
-
-# node role
-ROLE="$(get_metadata_attribute dataproc-role)"
-readonly ROLE
-
-# CUDA version and Driver version
-# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
-# https://developer.nvidia.com/cuda-downloads
-# Rocky8: 12.0: 525.147.05
-readonly -A DRIVER_FOR_CUDA=(
-          ["11.8"]="560.35.03"
-          ["12.0"]="525.60.13"  ["12.4"]="560.35.03"  ["12.6"]="560.35.03"
-)
-# https://developer.nvidia.com/cudnn-downloads
-if is_debuntu ; then
-readonly -A CUDNN_FOR_CUDA=(
-          ["11.8"]="9.5.1.17"
-          ["12.0"]="9.5.1.17"   ["12.4"]="9.5.1.17"   ["12.6"]="9.5.1.17"
-)
-elif is_rocky ; then
-# rocky:
-#   12.0: 8.8.1.3
-#   12.1: 8.9.3.28
-#   12.2: 8.9.7.29
-#   12.3: 9.0.0.312
-#   12.4: 9.1.1.17
-#   12.5: 9.2.1.18
-#   12.6: 9.5.1.17
-readonly -A CUDNN_FOR_CUDA=(
-          ["11.8"]="9.5.1.17"
-          ["12.0"]="8.8.1.3"   ["12.4"]="9.1.1.17"   ["12.6"]="9.5.1.17"
-)
-fi
-# https://developer.nvidia.com/nccl/nccl-download
-# 12.2: 2.19.3, 12.5: 2.21.5
-readonly -A NCCL_FOR_CUDA=(
-          ["11.8"]="2.15.5"
-          ["12.0"]="2.16.5"  ["12.4"]="2.23.4"     ["12.6"]="2.23.4"
-)
-readonly -A CUDA_SUBVER=(
-          ["11.8"]="11.8.0"
-          ["12.0"]="12.0.0"  ["12.4"]="12.4.1"     ["12.6"]="12.6.2"
-)
-
-RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
-readonly DEFAULT_CUDA_VERSION='12.4'
-CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
-if ( ( ge_debian12 || ge_rocky9 ) && version_le "${CUDA_VERSION%%.*}" "11" ) ; then
-  # CUDA 11 no longer supported on debian12 - 2024-11-22, rocky9 - 2024-11-27
-  CUDA_VERSION="${DEFAULT_CUDA_VERSION}"
-fi
-
-if ( version_ge "${CUDA_VERSION}" "12" && (le_debian11 || le_ubuntu18) ) ; then
-  # Only CUDA 12.0 supported on older debuntu
-  CUDA_VERSION="12.0"
-fi
-readonly CUDA_VERSION
-readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}"
-
-function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
-function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
-function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
-
-function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
-function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
-function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
-
-DEFAULT_DRIVER="${DRIVER_FOR_CUDA[${CUDA_VERSION}]}"
-if ( ge_ubuntu22 && version_le "${CUDA_VERSION}" "12.0" ) ; then
-                                         DEFAULT_DRIVER="560.28.03"  ; fi
-if ( is_debian11 || is_ubuntu20 ) ; then DEFAULT_DRIVER="560.28.03"  ; fi
-if ( is_rocky    && le_cuda11 )   ; then DEFAULT_DRIVER="525.147.05" ; fi
-if ( is_ubuntu20 && le_cuda11 )   ; then DEFAULT_DRIVER="535.183.06" ; fi
-if ( is_rocky9   && ge_cuda12 )   ; then DEFAULT_DRIVER="565.57.01"  ; fi
-DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
-
-readonly DRIVER_VERSION
-readonly DRIVER=${DRIVER_VERSION%%.*}
-
-readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
-readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
-
-# Parameters for NVIDIA-provided cuDNN library
-readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
-CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
-function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
-function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
-# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
-if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
-  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
-elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
-  # cuDNN v8 is not distribution for ubuntu20+, debian12
-  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
-elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
-  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
-  CUDNN_VERSION="8.8.0.121"
-fi
-readonly CUDNN_VERSION
-
-readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
-readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
-
-# Parameters for NVIDIA-provided Debian GPU driver
-readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
-
-readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
-
-# Short name for urls
-if is_ubuntu22  ; then
-    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
-    # https://developer.download.nvidia.com/compute/machine-learning/repos/
-    # use packages from previous release until such time as nvidia
-    # release ubuntu2204 builds
-
-    nccl_shortname="ubuntu2004"
-    shortname="$(os_id)$(os_vercat)"
-elif ge_rocky9 ; then
-    # use packages from previous release until such time as nvidia
-    # release rhel9 builds
-
-    nccl_shortname="rhel8"
-    shortname="rhel9"
-elif is_rocky ; then
-    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
-    nccl_shortname="${shortname}"
-else
-    shortname="$(os_id)$(os_vercat)"
-    nccl_shortname="${shortname}"
-fi
-
-# Parameters for NVIDIA-provided package repositories
-readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
-readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
-
-# Parameters for NVIDIA-provided NCCL library
-readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
-NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
-readonly NCCL_REPO_URL
-readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub
-
-function set_cuda_runfile_url() {
-  local RUNFILE_DRIVER_VERSION="${DRIVER_VERSION}"
-  local RUNFILE_CUDA_VERSION="${CUDA_FULL_VERSION}"
-
-  if ge_cuda12 ; then
-    if ( le_debian11 || le_ubuntu18 ) ; then
-      RUNFILE_DRIVER_VERSION="525.60.13"
-      RUNFILE_CUDA_VERSION="12.0.0"
-    elif ( le_rocky8 && version_le "${DATAPROC_IMAGE_VERSION}" "2.0" ) ; then
-      RUNFILE_DRIVER_VERSION="525.147.05"
-      RUNFILE_CUDA_VERSION="12.0.0"
-    fi
-  else
-    RUNFILE_DRIVER_VERSION="520.61.05"
-    RUNFILE_CUDA_VERSION="11.8.0"
-  fi
-
-  readonly RUNFILE_FILENAME="cuda_${RUNFILE_CUDA_VERSION}_${RUNFILE_DRIVER_VERSION}_linux.run"
-  CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${RUNFILE_CUDA_VERSION}"
-  DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${RUNFILE_FILENAME}"
-  readonly DEFAULT_NVIDIA_CUDA_URL
-
-  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
-  readonly NVIDIA_CUDA_URL
-}
-
-set_cuda_runfile_url
-
-# Parameter for NVIDIA-provided Rocky Linux GPU driver
-readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
-
-CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
-CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
-if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
-  # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
-  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
-    # When cuDNN version is greater than or equal to 8.4.1.50 use this format
-    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
-  fi
-  # Use legacy url format with one of the tarball name formats depending on version as above
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
-fi
-if ( version_ge "${CUDA_VERSION}" "12.0" ); then
-  # Use modern url format When cuda version is greater than or equal to 12.0
-  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
-fi
-readonly CUDNN_TARBALL
-readonly CUDNN_TARBALL_URL
-
-# Whether to install NVIDIA-provided or OS-provided GPU driver
-GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
-readonly GPU_DRIVER_PROVIDER
-
-# Stackdriver GPU agent parameters
-readonly GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
-# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
-INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
-readonly INSTALL_GPU_AGENT
-
-# Dataproc configurations
-readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-readonly HIVE_CONF_DIR='/etc/hive/conf'
-readonly SPARK_CONF_DIR='/etc/spark/conf'
-
-NVIDIA_SMI_PATH='/usr/bin'
-MIG_MAJOR_CAPS=0
-IS_MIG_ENABLED=0
-
 function execute_with_retries() (
   set +x
   local -r cmd="$*"
 
   if [[ "$cmd" =~ "^apt-get install" ]] ; then
     apt-get -y clean
-    apt-get -y autoremove
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
   fi
   for ((i = 0; i < 3; i++)); do
     set -x
@@ -348,222 +152,234 @@ function execute_with_retries() (
   return 1
 )
 
-CUDA_KEYRING_PKG_INSTALLED="0"
-function install_cuda_keyring_pkg() {
-  if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi
-  local kr_ver=1.1
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
-    -o "${tmpdir}/cuda-keyring.deb"
-  dpkg -i "${tmpdir}/cuda-keyring.deb"
-  rm -f "${tmpdir}/cuda-keyring.deb"
-  CUDA_KEYRING_PKG_INSTALLED="1"
-}
-
-function uninstall_cuda_keyring_pkg() {
-  apt-get purge -yq cuda-keyring
-  CUDA_KEYRING_PKG_INSTALLED="0"
-}
+function cache_fetched_package() {
+  local src_url="$1"
+  local gcs_fn="$2"
+  local local_fn="$3"
 
-CUDA_LOCAL_REPO_INSTALLED="0"
-function install_local_cuda_repo() {
-  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  CUDA_LOCAL_REPO_INSTALLED="1"
-  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
-  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
-  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
-  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
-  readonly DIST_KEYRING_DIR="/var/${pkgname}"
+  while ! command -v gcloud ; do sleep 5s ; done
 
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  if gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
+    time gcloud storage cp "${gcs_fn}" "${local_fn}"
+  else
+    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
+           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
+  fi
+}
 
-  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
-  cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
+function add_contrib_component() {
+  if ! is_debuntu ; then return ; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-kernel-open-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib"
 
-  if is_ubuntu ; then
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
-      -o /etc/apt/preferences.d/cuda-repository-pin-600
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
   fi
 }
-function uninstall_local_cuda_repo(){
-  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
-  CUDA_LOCAL_REPO_INSTALLED="0"
+
+function set_hadoop_property() {
+  local -r config_file=$1
+  local -r property=$2
+  local -r value=$3
+  "${bdcfg}" set_property \
+    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
+    --name "${property}" --value "${value}" \
+    --clobber
 }
 
-CUDNN_LOCAL_REPO_INSTALLED="0"
-CUDNN_PKG_NAME=""
-function install_local_cudnn_repo() {
-  if [[ "${CUDNN_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  pkgname="cudnn-local-repo-${shortname}-${CUDNN}"
-  CUDNN_PKG_NAME="${pkgname}"
-  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN}/local_installers/${local_deb_fn}"
+function clean_up_sources_lists() {
+  #
+  # bigtop (primary)
+  #
+  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
 
-  # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
+  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
+    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
 
-  dpkg -i "${tmpdir}/local-installer.deb"
+    local regional_bigtop_repo_uri
+    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
+      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
+      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
+      cut -d ' ' -f 2 |
+      head -1)
 
-  rm -f "${tmpdir}/local-installer.deb"
+    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+    else
+      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+    fi
 
-  cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
+    rm -f "${bigtop_kr_path}"
+    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
 
-  CUDNN_LOCAL_REPO_INSTALLED="1"
-}
+    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  fi
 
-function uninstall_local_cudnn_repo() {
-  apt-get purge -yq "${CUDNN_PKG_NAME}"
-  CUDNN_LOCAL_REPO_INSTALLED="0"
-}
+  #
+  # adoptium
+  #
+  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
+  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
+  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
+  rm -f "${adoptium_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
+   | gpg --dearmor -o "${adoptium_kr_path}"
+  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
+   > /etc/apt/sources.list.d/adoptium.list
 
-CUDNN8_LOCAL_REPO_INSTALLED="0"
-CUDNN8_PKG_NAME=""
-function install_local_cudnn8_repo() {
-  if [[ "${CUDNN8_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
-  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
-  elif is_debian ; then cudnn8_shortname="debian11"
-  else return 0 ; fi
-  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
-  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
-  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
 
-  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
-  CUDNN8_PKG_NAME="${pkgname}"
+  #
+  # docker
+  #
+  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
+  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
+  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
 
-  deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_fn="${tmpdir}/${deb_fn}"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
-  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-      "${local_deb_url}" -o "${local_deb_fn}"
+  rm -f "${docker_kr_path}"
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
+    | gpg --dearmor -o "${docker_kr_path}"
+  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
+    > ${docker_repo_file}
 
-  dpkg -i "${local_deb_fn}"
+  #
+  # google cloud + logging/monitoring
+  #
+  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
+    rm -f /usr/share/keyrings/cloud.google.gpg
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
+    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
+      list_file="/etc/apt/sources.list.d/${list}.list"
+      if [[ -f "${list_file}" ]]; then
+        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+      fi
+    done
+  fi
 
-  rm -f "${local_deb_fn}"
+  #
+  # cran-r
+  #
+  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
+    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
+    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
+    rm -f /usr/share/keyrings/cran-r.gpg
+    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
+      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
+    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+  fi
 
-  cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
-  CUDNN8_LOCAL_REPO_INSTALLED="1"
-}
+  #
+  # mysql
+  #
+  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
+    rm -f /usr/share/keyrings/mysql.gpg
+    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
+      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
+    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+  fi
+
+  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
 
-function uninstall_local_cudnn8_repo() {
-  apt-get purge -yq "${CUDNN8_PKG_NAME}"
-  CUDNN8_LOCAL_REPO_INSTALLED="0"
 }
 
-function install_nvidia_nccl() {
-  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
+function set_proxy(){
+  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"
 
-  if is_rocky ; then
-    execute_with_retries \
-      dnf -y -q install \
-        "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}"
-    sync
-  elif is_ubuntu ; then
-    install_cuda_keyring_pkg
+  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi
 
-    apt-get update -qq
+  export METADATA_HTTP_PROXY
+  export http_proxy="${METADATA_HTTP_PROXY}"
+  export https_proxy="${METADATA_HTTP_PROXY}"
+  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
+  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
+  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
+  local no_proxy_svc
+  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
+                      bigquery composer      pubsub bigquerydatatransfer dataflow \
+                      storage  datafusion    ; do
+    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
+  done
 
-    if is_ubuntu18 ; then
-      execute_with_retries \
-        apt-get install -q -y \
-          libnccl2 libnccl-dev
-      sync
-    else
-      execute_with_retries \
-        apt-get install -q -y \
-          "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}"
-      sync
-    fi
+  export NO_PROXY="${no_proxy}"
+}
+
+function is_ramdisk() {
+  if [[ "${1:-}" == "-f" ]] ; then unset IS_RAMDISK ; fi
+  if   ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "true" ) ; then return 0
+  elif ( test -v IS_RAMDISK && "${IS_RAMDISK}" == "false" ) ; then return 1 ; fi
+
+  if ( test -d /mnt/shm && grep -q /mnt/shm /proc/mounts ) ; then
+    IS_RAMDISK="true"
+    return 0
   else
-    echo "Unsupported OS: '${OS_NAME}'"
-    # NB: this tarball is 10GB in size, but can be used to install NCCL on non-ubuntu systems
-    # wget https://developer.download.nvidia.com/hpc-sdk/24.7/nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
-    # tar xpzf nvhpc_2024_247_Linux_x86_64_cuda_multi.tar.gz
-    # nvhpc_2024_247_Linux_x86_64_cuda_multi/install
-    return
+    IS_RAMDISK="false"
+    return 1
   fi
 }
 
-function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
-function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
+function mount_ramdisk(){
+  local free_mem
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi
 
-function install_nvidia_cudnn() {
-  local major_version
-  major_version="${CUDNN_VERSION%%.*}"
-  local cudnn_pkg_version
-  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
+  # Write to a ramdisk instead of churning the persistent disk
 
-  if is_rocky ; then
-    if is_cudnn8 ; then
-      execute_with_retries dnf -y -q install \
-        "libcudnn${major_version}" \
-        "libcudnn${major_version}-devel"
-      sync
-    elif is_cudnn9 ; then
-      execute_with_retries dnf -y -q install \
-        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
-        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
-      sync
-    else
-      echo "Unsupported cudnn version: '${major_version}'"
-    fi
-  elif is_debuntu; then
-    if ge_debian12 && is_src_os ; then
-      apt-get -y install nvidia-cudnn
-    else
-      local CUDNN="${CUDNN_VERSION%.*}"
-      if is_cudnn8 ; then
-        install_local_cudnn8_repo
+  tmpdir="/mnt/shm"
+  mkdir -p "${tmpdir}/pkgs_dirs"
+  mount -t tmpfs tmpfs "${tmpdir}"
 
-        apt-get update -qq
+  # Download conda packages to tmpfs
+  /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}/pkgs_dirs"
 
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-            "libcudnn8=${cudnn_pkg_version}" \
-            "libcudnn8-dev=${cudnn_pkg_version}"
-	sync
-      elif is_cudnn9 ; then
-	install_cuda_keyring_pkg
+  # Download OS packages to tmpfs
+  if is_debuntu ; then
+    mount -t tmpfs tmpfs /var/cache/apt/archives
+  else
+    mount -t tmpfs tmpfs /var/cache/dnf
+  fi
+  is_ramdisk -f
+}
 
-        apt-get update -qq
+function check_os() {
+  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
+      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
+      exit 1
+  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
+      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
+      exit 1
+  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
+      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
+      exit 1
+  fi
 
-        execute_with_retries \
-          apt-get -y install --no-install-recommends \
-          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
-          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
-	sync
-      else
-        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
-      fi
-    fi
-  elif is_ubuntu ; then
-    local -a packages
-    packages=(
-      "libcudnn${major_version}=${cudnn_pkg_version}"
-      "libcudnn${major_version}-dev=${cudnn_pkg_version}")
-    execute_with_retries \
-      apt-get install -q -y --no-install-recommends "${packages[*]}"
-    sync
-  else
-    echo "Unsupported OS: '${OS_NAME}'"
+  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
+  readonly SPARK_VERSION
+  if version_lt "${SPARK_VERSION}" "3.1" || \
+     version_ge "${SPARK_VERSION}" "4.0" ; then
+    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
     exit 1
   fi
 
-  ldconfig
-
-  echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
+  # Detect dataproc image version
+  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+    if test -v DATAPROC_VERSION ; then
+      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+    else
+      if   version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
+      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
+      else echo "Unknown dataproc image version" ; exit 1 ; fi
+    fi
+  fi
 }
 
-CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
-PSN="$(get_metadata_attribute private_secret_name)"
-readonly PSN
 function configure_dkms_certs() {
-  if [[ -z "${PSN}" ]]; then
+  if test -v PSN && [[ -z "${PSN}" ]]; then
       echo "No signing secret provided.  skipping";
       return 0
   fi
@@ -575,28 +391,27 @@ function configure_dkms_certs() {
     echo "Private key material exists"
 
     local expected_modulus_md5sum
-    expected_modulus_md5sum=$(get_metadata_attribute cert_modulus_md5sum)
+    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
     if [[ -n "${expected_modulus_md5sum}" ]]; then
       modulus_md5sum="${expected_modulus_md5sum}"
-    else
-      modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"
-    fi
 
-    # Verify that cert md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in \"${CA_TMPDIR}/db.rsa\" | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched rsa key modulus"
-    fi
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
+      # Verify that the private key's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched rsa key"
+      fi
 
-    # Verify that key md5sum matches expected md5sum
-    if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in /var/lib/dkms/mok.pub | openssl md5 | awk '{print $2}')" ]]; then
-        echo "unmatched x509 cert modulus"
+      # Verify that the certificate's modulus md5sum matches the expected md5sum
+      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
+        echo "unmatched x509 cert"
+      fi
+    else
+      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
     fi
+    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
 
     return
   fi
 
-
   # Retrieve cloud secrets keys
   local sig_priv_secret_name
   sig_priv_secret_name="${PSN}"
@@ -623,16 +438,14 @@ function configure_dkms_certs() {
       | base64 --decode \
       | dd status=none of="${CA_TMPDIR}/db.der"
 
-  # symlink private key and copy public cert from volatile storage for DKMS
-  if is_ubuntu ; then
-    mkdir -p /var/lib/shim-signed/mok
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/shim-signed/mok/MOK.priv
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/shim-signed/mok/MOK.der
-  else
-    mkdir -p /var/lib/dkms/
-    ln -sf "${CA_TMPDIR}/db.rsa" /var/lib/dkms/mok.key
-    cp -f "${CA_TMPDIR}/db.der" /var/lib/dkms/mok.pub
-  fi
+  local mok_directory="$(dirname "${mok_key}")"
+  mkdir -p "${mok_directory}"
+
+  # symlink private key and copy public cert from volatile storage to DKMS directory
+  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
+  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"
+
+  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
 }
 
 function clear_dkms_key {
@@ -640,430 +453,474 @@ function clear_dkms_key {
       echo "No signing secret provided.  skipping" >&2
       return 0
   fi
-  rm -rf "${CA_TMPDIR}" /var/lib/dkms/mok.key /var/lib/shim-signed/mok/MOK.priv
+  rm -rf "${CA_TMPDIR}" "${mok_key}"
 }
 
-function add_contrib_component() {
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-kernel-open-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib"
+function check_secure_boot() {
+  local SECURE_BOOT="disabled"
+  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
 
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
+  PSN="$(get_metadata_attribute private_secret_name)"
+  readonly PSN
+
+  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
+    echo "Error: Secure Boot is not supported on Debian before image 2.2. Consider disabling Secure Boot while creating the cluster."
+    return
+  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
+    echo "Secure boot is enabled, but no signing material provided."
+    echo "Consider either disabling secure boot or provide signing material as per"
+    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
+    return
   fi
-}
 
-function add_nonfree_components() {
-  if is_src_nvidia ; then return; fi
-  if ge_debian12 ; then
-      # Include in sources file components on which nvidia-open-kernel-dkms depends
-      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
-      local components="main contrib non-free non-free-firmware"
+  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
+  readonly CA_TMPDIR
 
-      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
-  elif is_debian ; then
-      sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
-  fi
+  if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv
+                      mok_der=/var/lib/shim-signed/mok/MOK.der
+                 else mok_key=/var/lib/dkms/mok.key
+                      mok_der=/var/lib/dkms/mok.pub ; fi
 }
 
-function add_repo_nvidia_container_toolkit() {
-  if is_debuntu ; then
-      local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
-      local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list
-      # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
-      test -f "${kr_path}" ||
-        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
-          | gpg --dearmor -o "${kr_path}"
-
-      test -f "${sources_list_path}" ||
-        curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
-          | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \
-          | tee "${sources_list_path}"
-  fi
+function restart_knox() {  # stop knox, remove everything under its deployments directory, start knox again
+  systemctl stop knox
+  rm -rf "${KNOX_HOME}/data/deployments/"*  # keep the * outside the quotes, otherwise the shell never expands it
+  systemctl start knox
 }
 
-function add_repo_cuda() {
-  if is_debuntu ; then
-    local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
-    local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
-    echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
-    | sudo tee "${sources_list_path}"
-    curl "${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64/cuda-archive-keyring.gpg" \
-      -o "${kr_path}"
-  elif is_rocky ; then
-    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
-    execute_with_retries "dnf clean all"
-  fi
+function is_complete() {
+  phase="$1"
+  test -f "${workdir}/complete/${phase}"
 }
 
-readonly uname_r=$(uname -r)
-function build_driver_from_github() {
-  if is_ubuntu ; then
-    mok_key=/var/lib/shim-signed/mok/MOK.priv
-    mok_der=/var/lib/shim-signed/mok/MOK.der
-  else
-    mok_key=/var/lib/dkms/mok.key
-    mok_der=/var/lib/dkms/mok.pub
-  fi
-  workdir=/opt/install-nvidia-driver
-  mkdir -p "${workdir}"
-  pushd "${workdir}"
-  test -d "${workdir}/open-gpu-kernel-modules" || {
-    tarball_fn="${DRIVER_VERSION}.tar.gz"
-    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
-      | tar xz
-    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
-  }
-  cd open-gpu-kernel-modules
+function mark_complete() {
+  phase="$1"
+  touch "${workdir}/complete/${phase}"
+}
 
-  time make -j$(nproc) modules \
-    >  /var/log/open-gpu-kernel-modules-build.log \
-    2> /var/log/open-gpu-kernel-modules-build_error.log
-  sync
+function mark_incomplete() {
+  phase="$1"
+  rm -f "${workdir}/complete/${phase}"
+}
 
-  if [[ -n "${PSN}" ]]; then
-    #configure_dkms_certs
-    for module in $(find kernel-open -name '*.ko'); do
-      "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
-      "${mok_key}" \
-      "${mok_der}" \
-      "${module}"
-    done
-    #clear_dkms_key
-  fi
+function install_dependencies() {
+  is_complete install-dependencies && return 0
 
-  make modules_install \
-    >> /var/log/open-gpu-kernel-modules-build.log \
-    2>> /var/log/open-gpu-kernel-modules-build_error.log
-  popd
+  pkg_list="screen"
+  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
+  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
+  mark_complete install-dependencies
 }
 
-function build_driver_from_packages() {
-  if is_debuntu ; then
-    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then
-      local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else
-      local pkglist=("nvidia-driver-${DRIVER}-open") ; fi
-    if is_debian ; then
-      pkglist=(
-        "firmware-nvidia-gsp=${DRIVER_VERSION}-1"
-        "nvidia-smi=${DRIVER_VERSION}-1"
-        "nvidia-alternative=${DRIVER_VERSION}-1"
-        "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1"
-        "nvidia-kernel-support=${DRIVER_VERSION}-1"
-        "nvidia-modprobe=${DRIVER_VERSION}-1"
-        "libnvidia-ml1=${DRIVER_VERSION}-1"
-      )
-    fi
-    add_contrib_component
-    apt-get update -qq
-    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
-    #configure_dkms_certs
-    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
-    sync
+function prepare_pip_env() {
+  # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
+  test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv"
+  source "${workdir}/python-venv/bin/activate"
 
-  elif is_rocky ; then
-    #configure_dkms_certs
-    if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
-      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
-    else
-      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
-    fi
-    sync
+  pip cache purge || echo "unable to purge pip cache"
+  if is_ramdisk ; then
+    # Download pip packages to tmpfs
+    mkdir -p "${tmpdir}/cache-dir"
+    pip config set global.cache-dir "${tmpdir}/cache-dir" || echo "unable to set global.cache-dir"
   fi
-  #clear_dkms_key
 }
 
-function install_nvidia_userspace_runfile() {
-  if test -f "${tmpdir}/userspace-complete" ; then return ; fi
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${USERSPACE_URL}" -o "${tmpdir}/userspace.run"
-  execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}"
-  rm -f "${tmpdir}/userspace.run"
-  touch "${tmpdir}/userspace-complete"
-  sync
+function prepare_conda_env() {
+  CONDA=/opt/conda/miniconda3/bin/conda
+  touch ~/.condarc
+  cp ~/.condarc ~/.condarc.default
+  if is_ramdisk ; then
+    # Download conda packages to tmpfs
+    mkdir -p "${tmpdir}/conda_cache"
+    ${CONDA} config --add pkgs_dirs "${tmpdir}/conda_cache"
+  fi
 }
 
-function install_cuda_runfile() {
-  if test -f "${tmpdir}/cuda-complete" ; then return ; fi
-  time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run"
-  execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}"
-  rm -f "${tmpdir}/cuda.run"
-  touch "${tmpdir}/cuda-complete"
-  sync
-}
+function prepare_common_env() {
+  define_os_comparison_functions
 
-function install_cuda_toolkit() {
-  local cudatk_package=cuda-toolkit
-  if ge_debian12 && is_src_os ; then
-    cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1"
-  elif [[ -n "${CUDA_VERSION}" ]]; then
-    cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}"
-  fi
-  cuda_package="cuda=${CUDA_FULL_VERSION}-1"
-  readonly cudatk_package
-  if is_debuntu ; then
-#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
-    execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
-    sync
-  elif is_rocky ; then
-    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
-    execute_with_retries dnf -y -q install "${cudatk_package}"
-    sync
-  fi
-}
+  # Verify OS compatibility and Secure boot state
+  check_os
+  check_secure_boot
 
-function load_kernel_module() {
-  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
-  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
-  done
+  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
 
-  depmod -a
-  modprobe nvidia
-  for suffix in uvm modeset drm; do
-    modprobe "nvidia-${suffix}"
-  done
-  # TODO: if peermem is available, also modprobe nvidia-peermem
-}
+  # Dataproc configurations
+  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
+  readonly HIVE_CONF_DIR='/etc/hive/conf'
+  readonly SPARK_CONF_DIR='/etc/spark/conf'
 
-# Install NVIDIA GPU driver provided by NVIDIA
-function install_nvidia_gpu_driver() {
-  if ( ge_debian12 && is_src_os ) ; then
-    add_nonfree_components
-    add_repo_nvidia_container_toolkit
-    apt-get update -qq
-    #configure_dkms_certs
-    apt-get -yq install \
-          nvidia-container-toolkit \
-          dkms \
-          nvidia-open-kernel-dkms \
-          nvidia-open-kernel-support \
-          nvidia-smi \
-          libglvnd0 \
-          libcuda1
-    #clear_dkms_key
-  elif ( le_ubuntu18 || le_debian10 || (ge_debian12 && le_cuda11) ) ; then
+  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
+  readonly OS_NAME
 
-    install_nvidia_userspace_runfile
+  # node role
+  ROLE="$(get_metadata_attribute dataproc-role)"
+  readonly ROLE
 
-    build_driver_from_github
+  # master node
+  MASTER="$(get_metadata_attribute dataproc-master)"
+  readonly MASTER
 
-    install_cuda_runfile
-  elif is_debuntu ; then
-    install_cuda_keyring_pkg
+  workdir=/opt/install-dpgce
+  tmpdir=/tmp/
+  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
+  readonly temp_bucket
+  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  uname_r=$(uname -r)
+  readonly uname_r
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  export DEBIAN_FRONTEND=noninteractive
 
-    build_driver_from_packages
+  # Knox config
+  readonly KNOX_HOME=/usr/lib/knox
 
-    install_cuda_toolkit
-  elif is_rocky ; then
-    add_repo_cuda
+  mkdir -p "${workdir}/complete"
+  set_proxy
+  mount_ramdisk
 
-    build_driver_from_packages
+  readonly install_log="${tmpdir}/install.log"
 
-    install_cuda_toolkit
-  else
-    echo "Unsupported OS: '${OS_NAME}'"
-    exit 1
-  fi
-  ldconfig
-  if is_src_os ; then
-    echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully"
+  is_complete prepare.common && return
+
+  repair_old_backports
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
+    if ge_debian12 ; then
+    apt-mark unhold systemd libsystemd0 ; fi
+    if is_ubuntu ; then
+      while ! command -v gcloud ; do sleep 5s ; done
+    fi
   else
-    echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+    dnf clean all
   fi
+
+  # When creating a disk image:
+  if [[ -n "$(get_metadata_attribute creating-image "")" ]]; then
+    df / > "/run/disk-usage.log"
+
+  # zero free disk space
+  ( set +e
+    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
+  )
+
+    install_dependencies
+
+    # Monitor disk usage in a screen session
+    touch "/run/keep-running-df"
+    screen -d -m -LUS keep-running-df \
+      bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+ fi
+
+  mark_complete prepare.common
 }
 
-# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
-function install_gpu_agent() {
-  if ! command -v pip; then
-    execute_with_retries "apt-get install -y -qq python-pip"
+function pip_exit_handler() {
+  if is_ramdisk ; then
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
   fi
-  local install_dir=/opt/gpu-utilization-agent
-  mkdir -p "${install_dir}"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
-  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
-    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
-    | sed -e 's/-u --format=/--format=/' \
-    | dd status=none of="${install_dir}/report_gpu_metrics.py"
-  execute_with_retries pip install -r "${install_dir}/requirements.txt"
-  sync
-
-  # Generate GPU service.
-  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
-[Unit]
-Description=GPU Utilization Metric Agent
-
-[Service]
-Type=simple
-PIDFile=/run/gpu_agent.pid
-ExecStart=/bin/bash --login -c 'python "${install_dir}/report_gpu_metrics.py"'
-User=root
-Group=root
-WorkingDirectory=/
-Restart=always
-
-[Install]
-WantedBy=multi-user.target
-EOF
-  # Reload systemd manager configuration
-  systemctl daemon-reload
-  # Enable gpu-utilization-agent service
-  systemctl --no-reload --now enable gpu-utilization-agent.service
 }
 
-function set_hadoop_property() {
-  local -r config_file=$1
-  local -r property=$2
-  local -r value=$3
-  "${bdcfg}" set_property \
-    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
-    --name "${property}" --value "${value}" \
-    --clobber
+function conda_exit_handler() {
+  mv ~/.condarc.default ~/.condarc
 }
 
-function configure_yarn() {
-  if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
-    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
+function common_exit_handler() {
+  set +ex
+  echo "Exit handler invoked"
+
+  # If system memory was sufficient to mount memory-backed filesystems
+  if is_ramdisk ; then
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
+        umount -f ${shmdir}
+      fi
+    done
   fi
-  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
 
-  set_hadoop_property 'capacity-scheduler.xml' \
-    'yarn.scheduler.capacity.resource-calculator' \
-    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
+  if is_debuntu ; then
+    # Clean up OS package cache
+    apt-get -y -qq clean
+    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
+    # re-hold systemd package
+    if ge_debian12 ; then
+    apt-mark hold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
 
-  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
-}
+  # When creating image, print disk usage statistics, zero unused disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    # print disk usage statistics for large components
+    if is_ubuntu ; then
+      du -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+        /usr/lib \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3 | sort -h
+    elif is_debian ; then
+      du -x -hs \
+        /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
+        /var/lib/{docker,mysql,} \
+        /opt/nvidia/* \
+        /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
+        /usr/bin \
+        /usr \
+        /var \
+        / 2>/dev/null | sort -h
+    else
+      du -hs \
+        /var/lib/docker \
+        /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
+        /usr/lib64/google-cloud-sdk \
+        /opt/nvidia/* \
+        /opt/conda/miniconda3
+    fi
 
-# This configuration should be applied only if GPU is attached to the node
-function configure_yarn_nodemanager() {
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' $NVIDIA_SMI_PATH
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
-  set_hadoop_property 'yarn-site.xml' \
-    'yarn.nodemanager.container-executor.class' \
-    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
-  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+    # Process disk usage logs from installation period
+    rm -f /run/keep-running-df
+    sync
+    sleep 5.01s
+    # compute maximum size of disk during installation
+    # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+    df / | tee -a "/run/disk-usage.log"
 
-  # Fix local dirs access permissions
-  local yarn_local_dirs=()
+    perl -e \
+          '@siz=( sort { $b <=> $a }   # numeric sort, largest first, so $siz[0] is the maximum
+                   map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } <STDIN> );
+$max=$siz[0]; $min=$siz[-1]; $starting="unknown"; $inc=q{$max-$starting};
+print( "    samples-taken: ", scalar @siz, $/,
+       "starting-disk-used: $starting", $/,
+       "maximum-disk-used:  $max", $/,
+       "minimum-disk-used:  $min", $/,
+       "     increased-by:  $inc", $/ )' < "/run/disk-usage.log"
 
-  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
-    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
-    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
 
-  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
-    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
+    # zero free disk space
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
   fi
+  echo "exit_handler has completed"
 }
 
-function configure_gpu_exclusive_mode() {
-  # check if running spark 3, if not, enable GPU exclusive mode
-  local spark_version
-  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
-  if [[ ${spark_version} != 3.* ]]; then
-    # include exclusive mode on GPU
-    nvsmi -c EXCLUSIVE_PROCESS
+
+#
+# Generate repo file under /etc/apt/sources.list.d/
+#
+function apt_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local -r include_src="${4:-yes}"
+  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"
+
+  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
+  if [[ "${include_src}" == "yes" ]] ; then
+    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
   fi
-}
 
-function fetch_mig_scripts() {
-  mkdir -p /usr/local/yarn-mig-scripts
-  sudo chmod 755 /usr/local/yarn-mig-scripts
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
-  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh
-  sudo chmod 755 /usr/local/yarn-mig-scripts/*
+  apt-get update -qq
 }
 
-function configure_gpu_script() {
-  # Download GPU discovery script
-  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
-  mkdir -p ${spark_gpu_script_dir}
-  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
-  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
-  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
-  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
-  cat > "${gpus_resources_script}" <<'EOF'
-#!/usr/bin/env bash
-
 #
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
+# Generate repo file under /etc/yum.repos.d/
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+function dnf_add_repo() {
+  local -r repo_name="$1"
+  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
+  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
+  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"
+
+  curl -s -L "${repo_url}" \
+    | dd of="${repo_path}" status=progress
+#    | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \
+}
+
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Keyrings default to
+# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
+# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
 #
+function os_add_repo() {
+  local -r repo_name="$1"
+  local -r signing_key_url="$2"
+  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
+  local kr_path
+  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
+                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi
 
-ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
+  mkdir -p "$(dirname "${kr_path}")"
 
-echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
-EOF
+  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \
+    | gpg --import --no-default-keyring --keyring "${kr_path}"
 
-  chmod a+rx "${gpus_resources_script}"
+  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
+                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
+}
 
-  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
-  if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then
-    echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}"
+
+function set_support_matrix() {
+  # CUDA version and Driver version
+  # https://docs.nvidia.com/deploy/cuda-compatibility/
+  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+  # https://developer.nvidia.com/cuda-downloads
+
+  # Minimum supported version for open kernel driver is 515.43.04
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
+  # Rocky8: 12.0: 525.147.05
+  local latest
+  latest="$(curl -s https://download.nvidia.com/XFree86/Linux-x86_64/latest.txt | awk '{print $1}')"
+  readonly -A DRIVER_FOR_CUDA=(
+          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
+          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
+  )
+  readonly -A DRIVER_SUBVER=(
+          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
+          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
+  )
+  # https://developer.nvidia.com/cudnn-downloads
+  if is_debuntu ; then
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
+          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
+  )
+  elif is_rocky ; then
+  # rocky:
+  #   12.0: 8.8.1.3
+  #   12.1: 8.9.3.28
+  #   12.2: 8.9.7.29
+  #   12.3: 9.0.0.312
+  #   12.4: 9.1.1.17
+  #   12.5: 9.2.1.18
+  #   12.6: 9.5.1.17
+  readonly -A CUDNN_FOR_CUDA=(
+          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
+          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
+  )
   fi
+  # https://developer.nvidia.com/nccl/nccl-download
+  # 12.2: 2.19.3, 12.5: 2.21.5
+  readonly -A NCCL_FOR_CUDA=(
+          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
+          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
+  )
+  readonly -A CUDA_SUBVER=(
+          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
+          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
+  )
 }
 
-function configure_gpu_isolation() {
-  # enable GPU isolation
-  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
-  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
-    # configure the container-executor.cfg to have major caps
-    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
-    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
-  else
-    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+function set_cuda_version() {
+  case "${DATAPROC_IMAGE_VERSION}" in
+    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
+    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
+    "2.2" ) DEFAULT_CUDA_VERSION="12.6.2" ;;
+    *   )
+      echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
+      exit 1
+      ;;
+  esac
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+  if [[ -n "${cuda_url}" ]] ; then
+    # if cuda-url metadata variable has been passed, extract default version from url
+    local CUDA_URL_VERSION
+    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
+      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
+    fi
   fi
+  readonly DEFAULT_CUDA_VERSION
 
-  # Configure a systemd unit to ensure that permissions are set on restart
-  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
-[Unit]
-Description=Set permissions to allow YARN to access device directories
+  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
+  if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then
+    CUDA_FULL_VERSION="${CUDA_VERSION}"
+    CUDA_VERSION="${CUDA_VERSION%.*}"
+  fi
+  readonly CUDA_VERSION
+  if ( ! test -v CUDA_FULL_VERSION ) ; then
+    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
+  fi
+  readonly CUDA_FULL_VERSION
+}
 
-[Service]
-ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
+function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; )
+function le_cuda12() ( set +x ; version_le "${CUDA_VERSION%%.*}" "12" ; )
+function ge_cuda12() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "12" ; )
 
-[Install]
-WantedBy=multi-user.target
-EOF
+function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; )
+function le_cuda11() ( set +x ; version_le "${CUDA_VERSION%%.*}" "11" ; )
+function ge_cuda11() ( set +x ; version_ge "${CUDA_VERSION%%.*}" "11" ; )
 
-  systemctl enable dataproc-cgroup-device-permissions
-  systemctl start dataproc-cgroup-device-permissions
+function set_driver_version() {
+  local gpu_driver_url
+  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')
+
+  local cuda_url
+  cuda_url=$(get_metadata_attribute 'cuda-url' '')
+
+  local DEFAULT_DRIVER
+  # Take default from gpu-driver-url metadata value
+  if [[ -n "${gpu_driver_url}" ]] ; then
+    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')"
+    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
+  # Take default from cuda-url metadata value as a backup
+  elif [[ -n "${cuda_url}" ]] ; then
+    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
+    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
+      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
+      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
+      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the version indicated by the cuda url as the default if it exists
+	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
+      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
+        # use the maximum sub-version available for the major version indicated in cuda url as the default
+	DEFAULT_DRIVER="${driver_max_maj_version}"
+      fi
+    fi
+  fi
+
+  if ( ! test -v DEFAULT_DRIVER ) ; then
+    # If a default driver version has not been extracted, use the default for this version of CUDA
+    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
+  fi
+
+  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")
+
+  readonly DRIVER_VERSION
+  readonly DRIVER="${DRIVER_VERSION%%.*}"
+
+  export DRIVER_VERSION DRIVER
+
+  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
+    exit 1
+  fi
 }
 
+function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; )
+function is_src_os()     ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; )
+
 function nvsmi() {
   local nvsmi="/usr/bin/nvidia-smi"
-  if   [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2
+  if   [[ "${nvsmi_works}" == "1" ]] ; then echo -n ''
   elif [[ ! -f "${nvsmi}" ]]         ; then echo "nvidia-smi not installed" >&2 ; return 0
   elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
   else nvsmi_works="1" ; fi
 
-  if [[ "$1" == "-L" ]] ; then
+  if test -v 1 && [[ "$1" == "-L" ]] ; then
     local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
     if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
     else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi
@@ -1074,394 +931,1289 @@ function nvsmi() {
   "${nvsmi}" $*
 }
 
-function install_dependencies() {
-  if is_debuntu ; then
-    execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" screen
-  elif is_rocky ; then
-    execute_with_retries dnf -y -q install pciutils gcc screen
+function clear_nvsmi_cache() {
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then
+    rm "${nvsmi_query_xml}"
+  fi
+}
 
-    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
-    local install_log="${tmpdir}/install.log"
-    set +e
-    eval "${dnf_cmd}" > "${install_log}" 2>&1
-    local retval="$?"
-    set -e
+function query_nvsmi() {
+  if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi
+  if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi
+  nvsmi -q -x --dtd > "${nvsmi_query_xml}"
+}
 
-    if [[ "${retval}" == "0" ]] ; then return ; fi
+function prepare_gpu_env(){
+  set_support_matrix
 
-    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
-      # this kernel-devel may have been migrated to the vault
-      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
-      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
-      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
-        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
-        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
-       )"
-    fi
+  set_cuda_version
+  set_driver_version
 
-    execute_with_retries "${dnf_cmd}"
+  set +e
+  gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
+  set -e
+  echo "gpu_count=[${gpu_count}]"
+  nvsmi_works="0"
+  nvsmi_query_xml="${tmpdir}/nvsmi.xml"
+  xmllint="/opt/conda/miniconda3/bin/xmllint"
+  NVIDIA_SMI_PATH='/usr/bin'
+  MIG_MAJOR_CAPS=0
+  IS_MIG_ENABLED=0
+  CUDNN_PKG_NAME=""
+  CUDNN8_PKG_NAME=""
+  CUDA_LOCAL_REPO_INSTALLED="0"
+
+  if ! test -v DEFAULT_RAPIDS_RUNTIME ; then
+    readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
   fi
-}
 
-function main() {
-  # This configuration should be run on all nodes
-  # regardless if they have attached GPUs
-  configure_yarn
-
-  # Detect NVIDIA GPU
-  if (lspci | grep -q NVIDIA); then
-    # if this is called without the MIG script then the drivers are not installed
-    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)"
-    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
-    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
-        if (echo "${migquery_result}" | grep Enabled); then
-          IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
-        fi
-      fi
-    fi
+  # Verify SPARK compatibility
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  readonly RAPIDS_RUNTIME
 
-    # if mig is enabled drivers would have already been installed
-    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
-      install_nvidia_gpu_driver
+  # determine whether we have nvidia-smi installed and working
+  nvsmi
+}
 
-      load_kernel_module
+# Hold all NVIDIA-related packages so they are not upgraded unintentionally or by services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  if ! is_debuntu ; then return ; fi
 
-      if [[ -n ${CUDNN_VERSION} ]]; then
-        install_nvidia_nccl
-        install_nvidia_cudnn
-      fi
-      #Install GPU metrics collection in Stackdriver if needed
-      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
-        install_gpu_agent
-        echo 'GPU metrics agent successfully deployed.'
-      else
-        echo 'GPU metrics agent will not be installed.'
-      fi
+  apt-mark hold nvidia-*
+  apt-mark hold libnvidia-*
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold xserver-xorg-video-nvidia*
+  fi
+}
 
-      # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
-      for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
-        rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
-      done
+function gpu_exit_handler() {
+  echo "no operations in gpu exit handler"
+}
 
-      MIG_GPU_LIST="$(nvsmi -L | grep -e MIG -e P100 -e H100 -e A100 || echo -n "")"
-      if test -n "$(nvsmi -L)" ; then
-	# cache the result of the gpu query
-        ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
-        echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
-      fi
-      NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
-      if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
-        # enable MIG on every GPU
-	for GPU_ID in $(echo ${MIG_GPU_LIST} | awk -F'[: ]' -e '{print $2}') ; do
-	  nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
-	done
-
-        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-        MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
-        fetch_mig_scripts
-      else
-        configure_gpu_exclusive_mode
-      fi
-    fi
 
-    configure_yarn_nodemanager
-    configure_gpu_script
-    configure_gpu_isolation
-  elif [[ "${ROLE}" == "Master" ]]; then
-    configure_yarn_nodemanager
-    configure_gpu_script
+function set_cudnn_version() {
+  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+  # Parameters for NVIDIA-provided cuDNN library
+  DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly DEFAULT_CUDNN_VERSION
+  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+  if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
+    # cuDNN v8 is not distributed for ubuntu20+, debian12
+    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
+    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
+    CUDNN_VERSION="8.8.0.121"
   fi
+  readonly CUDNN_VERSION
+}
 
-  # Restart YARN services if they are running already
-  if [[ $(systemctl show hadoop-yarn-resourcemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-resourcemanager.service
-  fi
-  if [[ $(systemctl show hadoop-yarn-nodemanager.service -p SubState --value) == 'running' ]]; then
-    systemctl restart hadoop-yarn-nodemanager.service
+
+function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
+function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
+
+function set_cuda_repo_shortname() {
+# Short name for urls
+# https://developer.download.nvidia.com/compute/cuda/repos/${shortname}
+  if is_rocky ; then
+    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
+  else
+    shortname="$(os_id)$(os_vercat)"
   fi
 }
 
-function clean_up_sources_lists() {
-  #
-  # bigtop (primary)
-  #
-  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"
-
-  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
-    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"
+function set_nv_urls() {
+  # Parameters for NVIDIA-provided package repositories
+  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
+  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"
 
-    local regional_bigtop_repo_uri
-    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
-      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
-      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
-      cut -d ' ' -f 2 |
-      head -1)
+  # Parameter for NVIDIA-provided Rocky Linux GPU driver
+  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
+}
 
-    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
+function set_cuda_runfile_url() { # sets NVIDIA_CUDA_URL and CUDA_RUNFILE (both readonly); echoes warnings for unsupported version combinations
+  local MAX_DRIVER_VERSION
+  local MAX_CUDA_VERSION
+
+  local MIN_OPEN_DRIVER_VER="515.48.07"
+  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
+  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER
+
+  if is_cuda12 ; then
+    if is_debian12 ; then
+      MIN_DRIVER_VERSION="545.23.06"
+      MIN_CUDA_VERSION="12.3.0"
+    elif is_debian10 ; then
+      MAX_DRIVER_VERSION="555.42.02"
+      MAX_CUDA_VERSION="12.5.0"
+    elif is_ubuntu18 ; then
+      MAX_DRIVER_VERSION="530.30.02"
+      MAX_CUDA_VERSION="12.1.1"
+    fi
+  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    if le_debian10 ; then
+      # cuda 11 is not supported for <= debian10
+      MAX_CUDA_VERSION="0"
+      MAX_DRIVER_VERSION="0"
+    fi
+  else
+    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+
+  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
+    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
+    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
+  fi
+  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
+    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
+    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
+  fi
+
+  # driver version named in cuda runfile filename, keyed by full CUDA version
+  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
+  readonly -A drv_for_cuda=(
+          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
+          ["11.8.0"]="520.61.05"
+          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
+          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
+          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
+          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
+          ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
+          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
+          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
+  )
+
+  # Build the default runfile URL; the 'cuda-url' metadata attribute overrides it, and the HEAD request below verifies it exists
+  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
+  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
+  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
+  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
+
+  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
+  readonly NVIDIA_CUDA_URL
+
+  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
+  readonly CUDA_RUNFILE
+
+  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
+    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
+    exit 1
+  fi
+
+  if ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then # must precede the 12.3.0 check, which also matches every CUDA 11 version
+    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
+    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
+  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
+    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
+  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
+    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
+  fi
+}
+
+function set_cudnn_tarball_url() {
+CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
+CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
+if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
+  # cuDNN >= 8.3.1.22: default to the archive name built from ${CUDA_VERSION%.*} (last dotted component dropped)
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
+  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
+    # cuDNN >= 8.3.1.22 and <= 8.4.1.50: the archive name carries the full ${CUDA_VERSION} instead
+    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
+  fi
+  # Use legacy url format with one of the tarball name formats depending on version as above
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
+fi
+if ( version_ge "${CUDA_VERSION}" "12.0" ); then
+  # Use the modern url format when the CUDA version is greater than or equal to 12.0; this overrides both values set above
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
+fi
+readonly CUDNN_TARBALL
+readonly CUDNN_TARBALL_URL
+}
+
+function install_cuda_keyring_pkg() {
+  if ( test -v CUDA_KEYRING_PKG_INSTALLED &&
+       [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] ); then return ; fi
+  local kr_ver=1.1
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
+    -o "${tmpdir}/cuda-keyring.deb"
+  dpkg -i "${tmpdir}/cuda-keyring.deb"
+  rm -f "${tmpdir}/cuda-keyring.deb"
+  CUDA_KEYRING_PKG_INSTALLED="1"
+}
+
+function uninstall_cuda_keyring_pkg() {
+  apt-get purge -yq cuda-keyring
+  CUDA_KEYRING_PKG_INSTALLED="0"
+}
+
+function install_local_cuda_repo() {
+  is_complete install-local-cuda-repo && return
+
+  if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi
+  CUDA_LOCAL_REPO_INSTALLED="1"
+  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
+  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
+  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
+  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
+  readonly DIST_KEYRING_DIR="/var/${pkgname}"
+
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+
+  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
+
+  if is_ubuntu ; then
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
+      -o /etc/apt/preferences.d/cuda-repository-pin-600
+  fi
+
+  mark_complete install-local-cuda-repo
+}
+function uninstall_local_cuda_repo(){
+  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
+  rm -f "${workdir}/complete/install-local-cuda-repo"
+}
+
+function install_local_cudnn_repo() {
+  is_complete install-local-cudnn-repo && return
+
+  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
+  CUDNN_PKG_NAME="${pkgname}"
+  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"
+
+  # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
+  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
+    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
+
+  dpkg -i "${tmpdir}/local-installer.deb"
+
+  rm -f "${tmpdir}/local-installer.deb"
+
+  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
+
+  mark_complete install-local-cudnn-repo
+}
+
+function uninstall_local_cudnn_repo() {
+  apt-get purge -yq "${CUDNN_PKG_NAME}"
+  rm -f "${workdir}/complete/install-local-cudnn-repo"
+}
+
+function install_local_cudnn8_repo() {
+  is_complete install-local-cudnn8-repo && return
+
+  if   is_ubuntu ; then cudnn8_shortname="ubuntu2004"
+  elif is_debian ; then cudnn8_shortname="debian11"
+  else return 0 ; fi
+  if   is_cuda12 ; then CUDNN8_CUDA_VER=12.0
+  elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8
+  else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"
+
+  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
+  CUDNN8_PKG_NAME="${pkgname}"
+
+  deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_fn="${tmpdir}/${deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
+
+  # cache the cudnn package
+  cache_fetched_package "${local_deb_url}" \
+                        "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
+                        "${local_deb_fn}"
+
+  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
+  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
+  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
+    mkdir -p "${cudnn_path}"
+    mount -t tmpfs tmpfs "${cudnn_path}"
+  fi
+
+  dpkg -i "${local_deb_fn}"
+
+  rm -f "${local_deb_fn}"
+
+  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
+  mark_complete install-local-cudnn8-repo
+}
+
+function uninstall_local_cudnn8_repo() {
+  apt-get purge -yq "${CUDNN8_PKG_NAME}"
+  mark_incomplete install-local-cudnn8-repo
+}
+
+function install_nvidia_nccl() {
+  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})
+
+  is_complete nccl && return
+
+  if is_cuda11 && is_debian12 ; then
+    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
+    return
+  fi
+
+  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"
+
+  # https://github.com/NVIDIA/nccl/blob/master/README.md
+  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # Fermi:     SM_20,             compute_20
+  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
+  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
+  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62
+
+  # The following architectures are supported by open kernel driver
+  # Volta:     SM_70,SM_72,       compute_70,compute_72
+  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87
+
+  # The following architectures are supported by CUDA v11.8+
+  # Ada:       SM_89,             compute_89
+  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
+  # Blackwell: SM_100,            compute_100
+                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
+  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
+  if version_ge "${CUDA_VERSION}" "11.8" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
+  fi
+  if version_ge "${CUDA_VERSION}" "12.0" ; then
+    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
+  fi
+
+  mkdir -p "${workdir}"
+  pushd "${workdir}"
+
+  test -d "${workdir}/nccl" || {
+    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "nccl-${NCCL_VERSION}-1" nccl
+  }
+
+  local build_path
+  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
+                       build_path="nccl/build/pkg/rpm/x86_64" ; fi
+
+  test -d "${workdir}/nccl/build" || {
+    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"
+
+    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
+    if echo "${output}" | grep -q "${gcs_tarball}" ; then
+      # cache hit - unpack from cache
+      echo "cache hit"
     else
-      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
+      # build and cache
+      pushd nccl
+      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
+      install_build_dependencies
+      if is_debuntu ; then
+        # These packages are required to build .deb packages from source
+        execute_with_retries \
+          apt-get install -y -qq build-essential devscripts debhelper fakeroot
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.debian.build
+      elif is_rocky ; then
+        # These packages are required to build .rpm packages from source
+        execute_with_retries \
+          dnf -y -q install rpm-build rpmdevtools
+        export NVCC_GENCODE
+        execute_with_retries make -j$(nproc) pkg.redhat.build
+      fi
+      tar czvf "/${local_tarball}" "../${build_path}"
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
     fi
+    gcloud storage cat "${gcs_tarball}" | tar xz
+  }
 
-    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
-    rm -f "${bigtop_kr_path}"
-    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
-      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"
+  if is_debuntu ; then
+    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
+  elif is_rocky ; then
+    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
+  fi
 
-    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
-    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
+  popd
+  mark_complete nccl
+}
+
+function install_nvidia_cudnn() {
+  is_complete cudnn && return
+
+  local major_version
+  major_version="${CUDNN_VERSION%%.*}"
+  local cudnn_pkg_version
+  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
+
+  if is_rocky ; then
+    if is_cudnn8 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn${major_version}" \
+        "libcudnn${major_version}-devel"
+      sync
+    elif is_cudnn9 ; then
+      execute_with_retries dnf -y -q install \
+        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
+        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
+      sync
+    else
+      echo "Unsupported cudnn version: '${major_version}'"
+    fi
+  elif is_debuntu; then
+    if ge_debian12 && is_src_os ; then
+      apt-get -y install nvidia-cudnn
+    else
+      if is_cudnn8 ; then
+        install_local_cudnn8_repo
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+            "libcudnn8=${cudnn_pkg_version}" \
+            "libcudnn8-dev=${cudnn_pkg_version}"
+
+        uninstall_local_cudnn8_repo
+	sync
+      elif is_cudnn9 ; then
+	install_cuda_keyring_pkg
+
+        apt-get update -qq
+
+        execute_with_retries \
+          apt-get -y install --no-install-recommends \
+          "libcudnn9-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \
+          "libcudnn9-static-cuda-${CUDA_VERSION%%.*}"
+	sync
+      else
+        echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
+      fi
+    fi
+  else
+    echo "Unsupported OS: '${_shortname}'"
+    exit 1
+  fi
+
+  ldconfig
+
+  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
+  mark_complete cudnn
+}
+
+function add_nonfree_components() {
+  if is_src_nvidia ; then return; fi
+  if ge_debian12 ; then
+      # Include in sources file components on which nvidia-open-kernel-dkms depends
+      local -r debian_sources="/etc/apt/sources.list.d/debian.sources"
+      local components="main contrib non-free non-free-firmware"
+
+      sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}"
+  elif is_debian ; then
+      sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
+  fi
+}
+
+#
+# Install package signing key and add corresponding repository
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+function add_repo_nvidia_container_toolkit() {
+  local nvctk_root="https://nvidia.github.io/libnvidia-container"
+  local signing_key_url="${nvctk_root}/gpgkey"
+  local repo_data
+
+  if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
+                  else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" ; fi
+
+  os_add_repo nvidia-container-toolkit \
+              "${signing_key_url}" \
+              "${repo_data}" \
+              "no"
+}
+
+function add_repo_cuda() {
+  if is_debuntu ; then
+    install_cuda_keyring_pkg # 11.7+, 12.0+
+  elif is_rocky ; then
+    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
+  fi
+}
+
+function build_driver_from_github() {
+  # Build the open kernel modules from the GitHub source, or unpack a cached build; the non-GPL driver will have been built on rocky8
+  if is_rocky8 ; then return 0 ; fi
+  pushd "${workdir}"
+
+  test -d "${workdir}/open-gpu-kernel-modules" || {
+    local tarball_fn="${DRIVER_VERSION}.tar.gz"
+    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
+      | tar xz
+    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
+  }
+
+  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+    local local_tarball="${workdir}/${build_tarball}"
+    local def_dir="${modulus_md5sum:-unsigned}"
+    local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
+
+    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+      echo "cache hit"
+    else
+      # cache miss: build the kernel modules, then upload the result to ${gcs_tarball}
+      pushd open-gpu-kernel-modules
+      install_build_dependencies
+      if ( is_cuda11 && is_ubuntu22 ) ; then
+        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
+        exit 1
+      fi
+      execute_with_retries make -j$(nproc) modules \
+        >  kernel-open/build.log \
+        2> kernel-open/build_error.log
+      # Sign kernel modules; the cwd is already open-gpu-kernel-modules, so locate them by absolute path
+      if [[ -n "${PSN}" ]]; then
+        configure_dkms_certs
+        for module in $(find "${workdir}/open-gpu-kernel-modules/kernel-open" -name '*.ko'); do
+          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
+          "${mok_key}" \
+          "${mok_der}" \
+          "${module}"
+        done
+        clear_dkms_key
+      fi
+      make modules_install \
+        >>  kernel-open/build.log \
+        2>> kernel-open/build_error.log
+      # Collect build logs and installed binaries
+      tar czvf "${local_tarball}" \
+        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+      rm "${local_tarball}"
+      make clean
+      popd
+    fi
+    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+    depmod -a
+  }
+
+  popd
+}
+
+function build_driver_from_packages() {
+  if is_debuntu ; then
+    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then
+      local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else
+      local pkglist=("nvidia-driver-${DRIVER}-open") ; fi
+    if is_debian ; then
+      pkglist=(
+        "firmware-nvidia-gsp=${DRIVER_VERSION}-1"
+        "nvidia-smi=${DRIVER_VERSION}-1"
+        "nvidia-alternative=${DRIVER_VERSION}-1"
+        "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1"
+        "nvidia-kernel-support=${DRIVER_VERSION}-1"
+        "nvidia-modprobe=${DRIVER_VERSION}-1"
+        "libnvidia-ml1=${DRIVER_VERSION}-1"
+      )
+    fi
+    add_contrib_component
+    apt-get update -qq
+    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
+    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
+    sync
+
+  elif is_rocky ; then
+    if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
+      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
+    else
+      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
+    fi
+    sync
+  fi
+}
+
+function install_nvidia_userspace_runfile() {
+  # Parameters for NVIDIA-provided Debian GPU driver
+  readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
+
+  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")
+
+  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
+  readonly USERSPACE_FILENAME
+
+  # This .run file contains NV's OpenGL implementation as well as
+  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
+  # including glib (https://docs.gtk.org/glib/), and what appears to
+  # be a copy of the source from the kernel-open directory of, for
+  # example, DRIVER_VERSION=560.35.03
+  #
+  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
+  #
+  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
+  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
+  is_complete userspace && return
+
+  local local_fn="${tmpdir}/userspace.run"
+
+  cache_fetched_package "${USERSPACE_URL}" \
+                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
+                        "${local_fn}"
+
+  local runfile_args
+  runfile_args=""
+  local cache_hit="0"
+  local local_tarball
+
+  if is_rocky8 ; then
+    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
+    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
+      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
+      local_tarball="${workdir}/${build_tarball}"
+      local def_dir="${modulus_md5sum:-unsigned}"
+      local build_dir=$(get_metadata_attribute modulus_md5sum "${def_dir}")
+
+      local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"
+
+      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
+        cache_hit="1"
+        runfile_args="--no-kernel-modules"
+        echo "cache hit"
+      else
+        install_build_dependencies
+        configure_dkms_certs
+        local signing_options
+        signing_options=""
+        if [[ -n "${PSN}" ]]; then
+          signing_options="--module-signing-hash sha256 \
+          --module-signing-x509-hash sha256 \
+          --module-signing-secret-key \"${mok_key}\" \
+          --module-signing-public-key \"${mok_der}\" \
+          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
+          "
+        fi
+        runfile_args="--no-dkms ${signing_options}"
+      fi
+    }
+  else
+    runfile_args="--no-kernel-modules"
+  fi
+
+  execute_with_retries bash "${local_fn}" -e -q \
+    ${runfile_args} \
+    --ui=none \
+    --install-libglvnd \
+    --tmpdir="${tmpdir}"
+
+  if is_rocky8 ; then
+    if [[ "${cache_hit}" == "1" ]] ; then
+      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
+      depmod -a
+    else
+      clear_dkms_key
+      tar czf "${local_tarball}" \
+        /var/log/nvidia-installer.log \
+        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
+      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
+    fi
+  fi
+
+  rm -f "${local_fn}"
+  mark_complete userspace
+  sync
+}
+
+function install_cuda_runfile() {  # run the CUDA runfile fetched from NVIDIA_CUDA_URL
+  is_complete cuda && return  # skip when the 'cuda' step is already marked complete
+
+  local local_fn="${tmpdir}/cuda.run"
+  # cache_fetched_package is handed the upstream URL, a path under pkg_bucket and the local destination
+  cache_fetched_package "${NVIDIA_CUDA_URL}" \
+			"${pkg_bucket}/${CUDA_RUNFILE}" \
+                        "${local_fn}"
+
+  execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"
+  rm -f "${local_fn}"  # the runfile is not needed once the installer has run
+  mark_complete cuda
+  sync
+}
+
+function install_cuda_toolkit() {  # install cuda + cuda-toolkit via apt, or cuda-toolkit via dnf
+  local cudatk_package=cuda-toolkit
+  if ge_debian12 && is_src_os ; then
+    cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1"  # exact version pin: cuda-toolkit=<full version>-1
+  elif [[ -n "${CUDA_VERSION}" ]]; then
+    cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}"  # versioned package name; dots become dashes
+  fi
+  cuda_package="cuda=${CUDA_FULL_VERSION}-1"  # NOTE(review): not declared local, so it stays set after return
+  readonly cudatk_package
+  if is_debuntu ; then
+#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
+    execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
+  elif is_rocky ; then
+    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
+    execute_with_retries dnf -y -q install "${cudatk_package}"
+  fi
+  sync
+}
+
+function load_kernel_module() {  # unload any loaded nvidia modules, then load them again
+  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
+  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
+    rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"  # best effort: a failed rmmod is only reported
+  done
+
+  depmod -a
+  modprobe nvidia  # the base module is loaded before the nvidia-* modules below
+  for suffix in uvm modeset drm; do
+    modprobe "nvidia-${suffix}"
+  done
+  # TODO: if peermem is available, also modprobe nvidia-peermem
+}
+
+function install_cuda(){  # install CUDA from the runfile, then add the CUDA package repository
+  is_complete cuda-repo && return
+
+  if ( ge_debian12 && is_src_os ) ; then
+    echo "installed with the driver on ${_shortname}"
+    return 0  # NOTE(review): this path returns without marking 'cuda-repo' complete
+  fi
+
+  # The OS package distributions are unreliable
+  install_cuda_runfile
+
+  # Includes CUDA packages
+  add_repo_cuda
+
+  mark_complete cuda-repo
+}
+
+function install_nvidia_container_toolkit() {  # install nvidia-container-toolkit and configure the container runtime
+  is_complete install-nvtk && return
+  # Default runtime: the first of docker, containerd, crio that 'command -v' finds, else empty
+  local container_runtime_default
+    if command -v docker     ; then container_runtime_default='docker'
+  elif command -v containerd ; then container_runtime_default='containerd'
+  elif command -v crio       ; then container_runtime_default='crio'
+                               else container_runtime_default='' ; fi
+  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
+  # With no runtime detected or configured, return without installing or marking complete
+  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi
+
+  add_repo_nvidia_container_toolkit
+  if is_debuntu ; then
+    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
+    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
+  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
+  systemctl restart "${CONTAINER_RUNTIME}"  # restart the runtime's unit after nvidia-ctk has configured it
+
+  mark_complete install-nvtk
+}
+
+# Install the NVIDIA GPU driver: OS packages on Debian 12+ source images, NVIDIA's runfile plus a github build otherwise
+function install_nvidia_gpu_driver() {
+  is_complete gpu-driver && return
+  # Debian 12+ source OS: install the distribution's nvidia-open-kernel-dkms packages
+  if ( ge_debian12 && is_src_os ) ; then
+    add_nonfree_components
+    apt-get update -qq
+    apt-get -yq install \
+        dkms \
+        nvidia-open-kernel-dkms \
+        nvidia-open-kernel-support \
+        nvidia-smi \
+        libglvnd0 \
+        libcuda1
+    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
+    return 0  # NOTE(review): this path returns without marking 'gpu-driver' complete
+  fi
+
+  # OS driver packages do not produce reliable driver ; use runfile
+  install_nvidia_userspace_runfile
+
+  build_driver_from_github
+
+  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
+  mark_complete gpu-driver
+}
+
+function install_ops_agent(){  # install the Google Cloud Ops Agent with Google's repo bootstrap script
+  is_complete ops-agent && return  # skip when the 'ops-agent' step is already marked complete
+
+  mkdir -p /opt/google
+  cd /opt/google
+  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
+  curl -fsSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh  # -f: fail on HTTP errors rather than saving the error page
+  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
+
+  mark_complete ops-agent  # record completion; the previous is_complete call here never wrote the marker
+}
+
+# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics (compute-gpu-monitoring agent)
+function install_gpu_monitoring_agent() {
+  download_gpu_monitoring_agent            # clone the agent repository under /opt/google
+  install_gpu_monitoring_agent_dependency  # create its venv and install requirements
+  start_gpu_monitoring_agent_service       # install and enable its systemd unit
+}
+
+function download_gpu_monitoring_agent(){  # install git and clone compute-gpu-monitoring into /opt/google
+  if is_rocky ; then
+    execute_with_retries "dnf -y -q install git"
+  else
+    execute_with_retries "apt-get install git -y"
+  fi
+  mkdir -p /opt/google
+  chmod 777 /opt/google  # NOTE(review): world-writable directory - confirm this is required
+  cd /opt/google
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+}
+
+function install_gpu_monitoring_agent_dependency(){  # create the agent's venv and install its requirements
+  cd /opt/google/compute-gpu-monitoring/linux
+  /opt/conda/miniconda3/bin/python3 -m venv venv
+  (  # subshell, so the venv activation does not persist in the calling shell
+    source venv/bin/activate
+    pip install wheel
+    pip install -Ur requirements.txt
+  )
+}
+
+function start_gpu_monitoring_agent_service(){  # install the agent's systemd unit, then enable and start it
+  cp /opt/google/compute-gpu-monitoring/linux/systemd/google_gpu_monitoring_agent_venv.service /lib/systemd/system
+  systemctl daemon-reload  # make systemd read the newly copied unit file
+  systemctl --no-reload --now enable /lib/systemd/system/google_gpu_monitoring_agent_venv.service
+}
+
+# Installs report_gpu_metrics.py as a systemd service; collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
+function install_gpu_agent() {
+  # Stackdriver GPU agent parameters
+#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
+  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
+  if ( ! command -v pip && is_debuntu ) ; then
+    execute_with_retries "apt-get install -y -qq python3-pip"
+  fi
+  local install_dir=/opt/gpu-utilization-agent
+  mkdir -p "${install_dir}"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
+  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
+    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
+    | sed -e 's/-u --format=/--format=/' \
+    | dd status=none of="${install_dir}/report_gpu_metrics.py"
+  local venv="${install_dir}/venv"  # the downloaded script above has '-u --format=' rewritten to '--format='
+  /opt/conda/miniconda3/bin/python3 -m venv "${venv}"
+(  # subshell, so the venv activation does not persist in the calling shell
+  source "${venv}/bin/activate"
+  python3 -m pip install --upgrade pip
+  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
+)
+  sync
+  # The heredoc delimiter below is unquoted: ${venv} and ${install_dir} are expanded when the unit is written
+  # Generate GPU service.
+  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
+[Unit]
+Description=GPU Utilization Metric Agent
+
+[Service]
+Type=simple
+PIDFile=/run/gpu_agent.pid
+ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
+User=root
+Group=root
+WorkingDirectory=/
+Restart=always
+
+[Install]
+WantedBy=multi-user.target
+EOF
+  # Reload systemd manager configuration
+  systemctl daemon-reload
+  # Enable gpu-utilization-agent service
+  systemctl --no-reload --now enable gpu-utilization-agent.service
+}
+
+function configure_gpu_exclusive_mode() {  # set EXCLUSIVE_PROCESS compute mode for Spark < 3.0
+  # only run this function when spark < 3.0
+  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
+  # include exclusive mode on GPU
+  nvsmi -c EXCLUSIVE_PROCESS
+  clear_nvsmi_cache  # presumably drops cached nvsmi output now that the mode changed; verify against its definition
+}
+
+function install_build_dependencies() {  # install the compiler / kernel-devel needed to build kernel modules
+  is_complete build-dependencies && return
+
+  if is_debuntu ; then
+    if is_ubuntu22 && is_cuda12 ; then
+      # On ubuntu22, the default compiler does not build some kernel module versions
+      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
+      execute_with_retries apt-get install -y -qq gcc-12
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
+      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
+      update-alternatives --set gcc /usr/bin/gcc-12
+    fi
+
+  elif is_rocky ; then
+    execute_with_retries dnf -y -q install gcc
+    # First attempt: kernel-devel for the running kernel from the configured repos (errexit suspended)
+    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
+    set +e
+    eval "${dnf_cmd}" > "${install_log}" 2>&1
+    local retval="$?"
+    set -e
+    # Success: record completion before returning, otherwise the marker is never written on this path
+    if [[ "${retval}" == "0" ]] ; then mark_complete build-dependencies ; return ; fi
+
+    if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
+      # this kernel-devel may have been migrated to the vault
+      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
+      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
+      dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
+        "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
+        "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
+       )"
+    fi
+    # Retry: either the vault command built above or, for any other failure, the original command
+    execute_with_retries "${dnf_cmd}"
   fi
+  mark_complete build-dependencies
+}
 
-  #
-  # adoptium
-  #
-  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
-  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
-  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
-  rm -f "${adoptium_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
-   | gpg --dearmor -o "${adoptium_kr_path}"
-  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
-   > /etc/apt/sources.list.d/adoptium.list
+function install_gpu_driver_and_cuda() {  # driver first, then CUDA, then (re)load the kernel modules
+  install_nvidia_gpu_driver
+  install_cuda
+  load_kernel_module
+}
 
+function prepare_gpu_install_env() {  # read GPU-related metadata attributes and derive version/URL settings
+  # Whether to install NVIDIA-provided or OS-provided GPU driver
+  GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
+  readonly GPU_DRIVER_PROVIDER
 
-  #
-  # docker
-  #
-  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
-  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
-  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"
+  # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
+  INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
+  readonly INSTALL_GPU_AGENT
 
-  rm -f "${docker_kr_path}"
-  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
-    | gpg --dearmor -o "${docker_kr_path}"
-  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
-    > ${docker_repo_file}
+  set_cuda_repo_shortname
+  set_nv_urls
+  set_cuda_runfile_url
+  set_cudnn_version
+  set_cudnn_tarball_url
 
-  #
-  # google cloud + logging/monitoring
-  #
-  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
-    rm -f /usr/share/keyrings/cloud.google.gpg
-    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
-    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
-      list_file="/etc/apt/sources.list.d/${list}.list"
-      if [[ -f "${list_file}" ]]; then
-        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
+  if   is_cuda11 ; then gcc_ver="11"  # gcc_ver is a global; it is not assigned for other CUDA major versions
+  elif is_cuda12 ; then gcc_ver="12" ; fi
+}
+
+function gpu_install_exit_handler() {  # unmount the temporary cudnn tmpfs (ramdisk case), then hold nvidia packages
+  if is_ramdisk ; then
+    for shmdir in /var/cudnn-local ; do
+      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then  # mounted, but not via fstab
+        umount -f ${shmdir}
       fi
     done
   fi
+  hold_nvidia_packages
+}
 
-  #
-  # cran-r
-  #
-  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
-    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
-    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
-    rm -f /usr/share/keyrings/cran-r.gpg
-    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
-      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
-    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
+# This configuration should be applied only if GPU is attached to the node
+function configure_yarn_nodemanager() {  # set LinuxContainerExecutor + cgroups properties in yarn-site.xml
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.container-executor.class' \
+    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'
+
+  # Fix local dirs access permissions
+  local yarn_local_dirs=()
+  # Split the comma-separated yarn.nodemanager.local-dirs value; each element keeps its trailing ','
+  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
+    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
+    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')
+
+  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
+    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"  # the /,/ expansion strips the ',' kept by readarray
   fi
+}
 
-  #
-  # mysql
-  #
-  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
-    rm -f /usr/share/keyrings/mysql.gpg
-    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
-      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
-    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
+function yarn_exit_handler() {  # stop and start the YARN daemons that are currently running
+  # Restart YARN services if they are running already
+  for svc in resourcemanager nodemanager; do
+    if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then
+      systemctl  stop "hadoop-yarn-${svc}.service"
+      systemctl start "hadoop-yarn-${svc}.service"
+    fi
+  done
+  # restart services stopped during preparation stage
+  # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+}
+
+
+function configure_yarn_gpu_resources() {  # register yarn.io/gpu as a YARN resource type
+  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts
+  if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then
+    printf '<?xml version="1.0" ?>\n<configuration/>' >"${HADOOP_CONF_DIR}/resource-types.xml"
   fi
+  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
 
-  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi
+  set_hadoop_property 'capacity-scheduler.xml' \
+    'yarn.scheduler.capacity.resource-calculator' \
+    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
 
+  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
 }
 
-function exit_handler() {
-  set +ex
-  echo "Exit handler invoked"
+function configure_gpu_script() {  # write the GPU discovery script and append RAPIDS settings to spark-defaults.conf
+  # Download GPU discovery script
+  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
+  mkdir -p ${spark_gpu_script_dir}
+  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
+  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
+  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
+  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
+  cat > "${gpus_resources_script}" <<'EOF'
+#!/usr/bin/env bash
 
-  # Purge private key material until next grant
-  clear_dkms_key
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}
 
-  # Clear pip cache
-  pip cache purge || echo "unable to purge pip cache"
+ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
 
-  # If system memory was sufficient to mount memory-backed filesystems
-  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
-    # remove the tmpfs pip cache-dir
-    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
+EOF
 
-    # Clean up shared memory mounts
-    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do
-      if grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ; then
-        umount -f ${shmdir}
-      fi
-    done
+  chmod a+rx "${gpus_resources_script}"
 
-    # restart services stopped during preparation stage
-    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
-  fi
+  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
 
-  if is_debuntu ; then
-    # Clean up OS package cache
-    apt-get -y -qq clean
-    apt-get -y -qq autoremove
-    # re-hold systemd package
-    if ge_debian12 ; then
-    apt-mark hold systemd libsystemd0 ; fi
+  local executor_cores  # 75% of nproc, rounded down to an even number
+  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
+  local executor_memory_gb  # 75% of MemFree (reported in kB), in whole GiB
+  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
+  local task_cpus=2
+  local gpu_amount
+
+  # The current setting of spark.task.resource.gpu.amount (0.333) is
+  # not ideal to get the best performance from the RAPIDS Accelerator
+  # plugin. It's recommended to be 1/{executor core count} unless you
+  # have a special use case.
+#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
+  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"
+
+# cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
+
+  cat >>"${spark_defaults_conf}" <<EOF
+###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
+# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
+# query explain output won't show GPU operator, if the user has doubts
+# they can uncomment the line before seeing the GPU plan explain;
+# having AQE enabled gives user the best performance.
+spark.executor.resource.gpu.amount=${gpu_count}
+spark.plugins=com.nvidia.spark.SQLPlugin
+spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
+spark.executor.cores=${executor_cores}
+spark.executor.memory=${executor_memory_gb}G
+spark.dynamicAllocation.enabled=false
+# please update this config according to your application
+spark.task.resource.gpu.amount=${gpu_amount}
+spark.task.cpus=${task_cpus}
+spark.yarn.unmanagedAM.enabled=false
+###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
+EOF
+}
+
+function configure_yarn_nodemanager_gpu() {  # enable the yarn.io/gpu resource plugin on this NodeManager
+  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
+  set_hadoop_property 'yarn-site.xml' \
+    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
+  configure_yarn_nodemanager  # also apply the container-executor / cgroups settings
+}
+
+function configure_gpu_isolation() {  # append [gpu]/[cgroups] sections to container-executor.cfg and install a permissions unit
+  # enable GPU isolation: set the container executor group to yarn in container-executor.cfg
+  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
+  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
+    # configure the container-executor.cfg to have major caps
+    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
+    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
+    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
   else
-    dnf clean all
+    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
   fi
 
-  # print disk usage statistics for large components
-  if is_ubuntu ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  elif is_debian ; then
-    du -hs \
-      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
-      /usr/lib \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3 | sort -h
-  else
-    du -hs \
-      /var/lib/docker \
-      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
-      /usr/lib64/google-cloud-sdk \
-      /usr/lib \
-      /opt/nvidia/* \
-      /usr/local/cuda-1?.? \
-      /opt/conda/miniconda3
-  fi
-
-  # Process disk usage logs from installation period
-  rm -f /run/keep-running-df
-  sync
-  sleep 5.01s
-  # compute maximum size of disk during installation
-  # Log file contains logs like the following (minus the preceeding #):
-#Filesystem     1K-blocks    Used Available Use% Mounted on
-#/dev/vda2        7096908 2611344   4182932  39% /
-  df / | tee -a "/run/disk-usage.log"
+  # Configure a systemd unit to ensure that permissions are set on restart
+  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
+[Unit]
+Description=Set permissions to allow YARN to access device directories
 
-  perl -e '@siz=( sort { $a => $b }
-                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } <STDIN> );
-$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
-print( "    samples-taken: ", scalar @siz, $/,
-       "maximum-disk-used: $max", $/,
-       "minimum-disk-used: $min", $/,
-       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"
+[Service]
+ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"
 
-  echo "exit_handler has completed"
+[Install]
+WantedBy=multi-user.target
+EOF
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero
-    sync
-    sleep 3s
-    rm -f /zero
+  systemctl enable dataproc-cgroup-device-permissions
+  systemctl start dataproc-cgroup-device-permissions
+}
+
+function setup_gpu_yarn() {  # configure YARN (and Spark) for GPU scheduling on this node
+  # This configuration should be run on all nodes
+  # regardless if they have attached GPUs
+  configure_yarn_gpu_resources
+
+  # When there is no GPU, but the installer is executing on a master node:
+  if [[ "${gpu_count}" == "0" ]] ; then
+    if [[ "${ROLE}" == "Master" ]]; then
+      configure_yarn_nodemanager
+    fi
+    return 0  # nodes without a GPU stop here
   fi
 
-  return 0
+  install_nvidia_container_toolkit
+  configure_yarn_nodemanager_gpu
+  configure_gpu_script
+  configure_gpu_isolation
 }
 
-function set_proxy(){
-  export METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy)"
-  export http_proxy="${METADATA_HTTP_PROXY}"
-  export https_proxy="${METADATA_HTTP_PROXY}"
-  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
-  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
-  export no_proxy=metadata.google.internal,169.254.169.254
-  export NO_PROXY=metadata.google.internal,169.254.169.254
-}
 
-function mount_ramdisk(){
-  local free_mem
-  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
-  if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi
+function install_spark_rapids() {  # fetch the xgboost4j and rapids-4-spark jars into /usr/lib/spark/jars
+  # Update SPARK RAPIDS config
+  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3
 
-  # Write to a ramdisk instead of churning the persistent disk
+  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
+  local -r scala_ver="2.12"
 
-  tmpdir="/mnt/shm"
-  mkdir -p "${tmpdir}"
-  mount -t tmpfs tmpfs "${tmpdir}"
+  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
+    local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
+  fi
 
-  # Clear pip cache
-  # TODO: make this conditional on which OSs have pip without cache purge
-  pip cache purge || echo "unable to purge pip cache"
+  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})  # metadata overrides the default
+  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})  # metadata overrides the default
 
-  # Download pip packages to tmpfs
-  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
+  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
+  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
 
-  # Download OS packages to tmpfs
-  if is_debuntu ; then
-    mount -t tmpfs tmpfs /var/cache/apt/archives
-  else
-    mount -t tmpfs tmpfs /var/cache/dnf
-  fi
-}
+  local jar_basename  # reused for each jar; cache_fetched_package gets the URL, a pkg_bucket path and the destination
 
-function prepare_to_install(){
-  nvsmi_works="0"
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  tmpdir=/tmp/
-  if ! is_debuntu && ! is_rocky ; then
-    echo "Unsupported OS: '$(os_name)'"
-    exit 1
-  fi
+  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
 
-  repair_old_backports
+  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
+  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
 
-  export DEBIAN_FRONTEND=noninteractive
+  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
+  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
+                        "/usr/lib/spark/jars/${jar_basename}"
+}
 
-  trap exit_handler EXIT
-  mount_ramdisk
-  install_log="${tmpdir}/install.log"
 
-  set_proxy
+function main() {  # install driver and CUDA, optional metrics agent, YARN GPU setup, then the RAPIDS runtime
+  install_gpu_driver_and_cuda
 
-  if is_debuntu ; then
-    clean_up_sources_lists
-    apt-get update -qq
-    apt-get -y clean
-    sleep 5s
-    apt-get -y -qq autoremove
-    if ge_debian12 ; then
-    apt-mark unhold systemd libsystemd0 ; fi
+  #Install GPU metrics collection in Stackdriver if needed
+  if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
+    install_gpu_agent
+#    install_gpu_monitoring_agent
+    echo 'GPU metrics agent successfully deployed.'
   else
-    dnf clean all
+    echo 'GPU metrics agent has not been installed.'
   fi
+  configure_gpu_exclusive_mode
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
-    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
-  ) fi
+  setup_gpu_yarn
+
+  echo "yarn setup complete"
+
+  if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then  # only when CUDNN_VERSION is set and non-empty
+    install_nvidia_nccl
+    install_nvidia_cudnn
+  fi
+
+  if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then
+    install_spark_rapids
+    configure_gpu_script  # NOTE(review): setup_gpu_yarn already calls this when gpu_count != 0, and it appends to spark-defaults.conf each time
+    echo "RAPIDS initialized with Spark runtime"
+  elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then
+    # we are not currently tooled for installing dask in this action.
+    echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh"
+  else
+    echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}"
+  fi
 
-  configure_dkms_certs
+  echo "main complete"
+  return 0
+}
 
-  install_dependencies
+function exit_handler() {  # EXIT trap: run each component's exit handler in order
+  set +e  # errexit off, so one failing handler does not stop the ones after it
+  gpu_install_exit_handler
+  gpu_exit_handler
+  pip_exit_handler
+  yarn_exit_handler
+  common_exit_handler
+  return 0
+}
 
-  # Monitor disk usage in a screen session
-  df / > "/run/disk-usage.log"
-  touch "/run/keep-running-df"
-  screen -d -m -US keep-running-df \
-    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+function prepare_to_install(){  # run each component's environment preparation, then register the exit trap
+  prepare_common_env
+  prepare_pip_env
+  prepare_gpu_env
+  prepare_gpu_install_env
+  trap exit_handler EXIT  # registered last, so exit_handler does not run if a prepare_* step above exits
 }
 
 prepare_to_install

From 10ceea0e0d72b520c032eae7d66a769cbf46ec6e Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 21:21:47 -0800
Subject: [PATCH 11/15] printing the time of the connection failure before
 retrying

---
 templates/dask/util_functions | 1 +
 1 file changed, 1 insertion(+)

diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index ce6964e94..17c17479c 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -150,6 +150,7 @@ function start_systemd_dask_service() {
     # Pause while scheduler comes online
     retries=30
     while ! nc -vz "${MASTER}" 8786 ; do
+      date
       sleep 3s
       ((retries--))
       if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi

From 24623a672db9f1fc5392c666a2e8fdd2d1cb055f Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 21:31:41 -0800
Subject: [PATCH 12/15] provide more leeway for slow dask scheduler startup

---
 templates/dask/util_functions | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/dask/util_functions b/templates/dask/util_functions
index 17c17479c..f91c195db 100644
--- a/templates/dask/util_functions
+++ b/templates/dask/util_functions
@@ -151,7 +151,7 @@ function start_systemd_dask_service() {
     retries=30
     while ! nc -vz "${MASTER}" 8786 ; do
       date
-      sleep 3s
+      sleep 7s
       ((retries--))
       if [[ "${retries}" == "0" ]] ; then echo "dask scheduler unreachable" ; exit 1 ; fi
     done

From 22318dde4a37eefbb204efd9c8f560d03fe7d16b Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 21:51:32 -0800
Subject: [PATCH 13/15] correct skip conditions in GPU tests

---
 gpu/test_gpu.py | 78 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 25 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index f8438915f..e83699e7e 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -64,8 +64,14 @@ def verify_instance_spark(self):
   def test_install_gpu_default_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
+
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = None
     if driver_provider is not None:
@@ -94,8 +100,12 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
 
     self.skipTest("No need to regularly test not installing the agent")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=false"
     if driver_provider is not None:
@@ -121,8 +131,13 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
+    self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
+
+    if configuration == 'KERBEROS' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('KERBEROS fails with image version <= 2.1')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true"
     if driver_provider is not None:
@@ -159,15 +174,22 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
       self.skipTest("CUDA == 12.0 not supported on debian 12")
 
-    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
           ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
-      self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases")
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
     and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9")
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
+
 
     metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version)
     self.createCluster(
@@ -236,12 +258,13 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   )
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
-    and configuration == 'SINGLE':
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty")
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     metadata = None
     if driver_provider is not None:
@@ -270,16 +293,21 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
 
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date")
-
-    if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \
-    and configuration == 'SINGLE':
-      self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty")
+    if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
+    and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
+          ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ):
+      self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases")
 
-    if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \
-    and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ):
-      self.skipTest("CUDA == 12.0 not supported on debian 12")
+    if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \
+    and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+      self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" )
+
+    if configuration == 'SINGLE' \
+    and self.getImageOs() == 'rocky' \
+    and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
+      # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
+      unittest.expectedFailure(self)
+      self.skipTest("known to fail")
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \
     and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \

From 138e26cfb1b9fb3c711a8869dbaad7143af764c5 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Tue, 7 Jan 2025 22:14:28 -0800
Subject: [PATCH 14/15] removed expectedFailure calls since unittest import was
 removed

---
 gpu/test_gpu.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index e83699e7e..395ddff0f 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -70,7 +70,6 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = None
@@ -104,7 +103,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=false"
@@ -136,7 +134,6 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
     if configuration == 'KERBEROS' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('KERBEROS fails with image version <= 2.1')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = "install-gpu-agent=true"
@@ -187,7 +184,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
 
@@ -263,7 +259,6 @@ def test_gpu_allocation(self, configuration, master_accelerator,
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     metadata = None
@@ -306,7 +301,6 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf
     and self.getImageOs() == 'rocky' \
     and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
       # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty')
-      unittest.expectedFailure(self)
       self.skipTest("known to fail")
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \

From 1e702a6d9499d174355bcbfcfe79f98eeb2f6963 Mon Sep 17 00:00:00 2001
From: "C.J. Collier" <cjac@google.com>
Date: Wed, 8 Jan 2025 00:04:22 -0800
Subject: [PATCH 15/15] increase CPU count on slow cluster create to help
 ubuntu18 through the finish line

---
 gpu/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 395ddff0f..8237359e4 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -269,7 +269,7 @@ def test_gpu_allocation(self, configuration, master_accelerator,
         configuration,
         self.INIT_ACTIONS,
         metadata=metadata,
-        machine_type="n1-highmem-8",
+        machine_type="n1-standard-32",
         master_accelerator=master_accelerator,
         worker_accelerator=worker_accelerator,
         boot_disk_size="50GB",