From 554b086238a5223045bfd83ad06a7ee3fa8e7d7e Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 31 Oct 2024 10:16:47 -0700 Subject: [PATCH 1/3] refactored dask and rapids to more closely match --- rapids/rapids.sh | 148 ++++++++++++++++++++++++++---------------- rapids/test_rapids.py | 88 ++++++++++++++++++------- 2 files changed, 158 insertions(+), 78 deletions(-) diff --git a/rapids/rapids.sh b/rapids/rapids.sh index 6c5c9d411..09c46f3c5 100644 --- a/rapids/rapids.sh +++ b/rapids/rapids.sh @@ -19,11 +19,12 @@ set -euxo pipefail -function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } -function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } -function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } -function is_debuntu() { is_debian || is_ubuntu ; } +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function is_ubuntu() ( set +x ; [[ "$(os_id)" == 'ubuntu' ]] ; ) +function is_ubuntu18() ( set +x ; is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; ) +function is_debian() ( set +x ; [[ "$(os_id)" == 'debian' ]] ; ) +function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) function print_metadata_value() { local readonly tmpfile=$(mktemp) @@ -72,17 +73,6 @@ function get_metadata_attribute() ( function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } -function execute_with_retries() { - local -r cmd="$*" - for i in {0..9} ; do - if eval "$cmd"; then - return 0 ; fi - sleep 5 - done - echo "Cmd '${cmd}' failed." - return 1 -} - function configure_dask_yarn() { readonly DASK_YARN_CONFIG_DIR=/etc/dask/ readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml @@ -469,8 +459,7 @@ function install_dask_rapids() { ( set +e local is_installed="0" for installer in "${mamba}" "${conda}" ; do - test -d "${DASK_CONDA_ENV}" || \ - time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ + time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \ ${CONDA_PACKAGES[*]} \ "${python_spec}" \ @@ -479,8 +468,10 @@ function install_dask_rapids() { if [[ "$retval" == "0" ]] ; then is_installed="1" break + else + test -d "${DASK_CONDA_ENV}" && ( "${conda}" remove -n 'dask-rapids' --all || rm -rf "${DASK_CONDA_ENV}" ) + "${conda}" config --set channel_priority flexible fi - "${conda}" config --set channel_priority flexible done if [[ "${is_installed}" == "0" ]]; then echo "failed to install dask" @@ -533,8 +524,8 @@ function main() { fi } -function exit_handler() ( - set +e +function exit_handler() { + set +ex echo "Exit handler invoked" # Free conda cache @@ -543,16 +534,29 @@ function exit_handler() ( # Clear pip cache pip cache purge || echo "unable to purge pip cache" - # remove the tmpfs conda pkgs_dirs - if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # Stop hadoop services + systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
 
-  # Clean up shared memory mounts
-  for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do
-    if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then
-      rm -rf ${shmdir}/*
-      umount -f ${shmdir}
-    fi
-  done
+    # remove the tmpfs conda pkgs_dirs
+    /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config"
+
+    # remove the tmpfs pip cache-dir
+    pip config unset global.cache-dir || echo "unable to unset global pip cache"
+
+    # Clean up shared memory mounts
+    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do
+      if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then
+        sync
+        sleep 3s
+        # retry the umount inline; execute_with_retries is removed from this script above
+        for i in 1 2 3 ; do umount -f ${shmdir} && break || sleep 3s ; done
+      fi
+    done
+
+    umount -f /tmp
+    systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+  fi
 
   # Clean up OS package cache ; re-hold systemd package
   if is_debuntu ; then
@@ -562,36 +566,64 @@
     dnf clean all
   fi
 
-  # print disk usage statistics
-  if is_debuntu ; then
-    # Rocky doesn't have sort -h and fails when the argument is passed
-    du --max-depth 3 -hx / | sort -h | tail -10
+  # print disk usage statistics for large components
+  if is_ubuntu ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  elif is_debian ; then
+    du -hs \
+      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
+      /usr/lib \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  else
+    du -hs \
+      /var/lib/docker \
+      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
+      /usr/lib64/google-cloud-sdk \
+      /usr/lib \
+      /opt/nvidia/* \
+      /usr/local/cuda-1?.? \
+      /opt/conda/miniconda3
+  fi
 
   # Process disk usage logs from installation period
-  rm -f "${tmpdir}/keep-running-df"
-  sleep 6s
+  rm -f /run/keep-running-df
+  sync
+  sleep 5.01s
   # compute maximum size of disk during installation
   # Log file contains logs like the following (minus the preceding #):
-#Filesystem      Size  Used Avail Use% Mounted on
-#/dev/vda2       6.8G  2.5G  4.0G  39% /
-  df -h / | tee -a "${tmpdir}/disk-usage.log"
-  perl -e '$max=( sort
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
+
+  perl -e '@siz=( sort { $b <=> $a }
                   map { (split)[2] =~ /^(\d+)/ }
-                  grep { m:^/: } )[-1];
-print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log"
+                  grep { m:^/: } );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( " samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       " increased-by: $inc", $/ )' < "/run/disk-usage.log"
 
   echo "exit_handler has completed"
 
   # zero free disk space
   if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero ; sync ; rm -f /zero
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
   fi
 
   return 0
-)
+}
 
-function prepare_to_install(){
+function prepare_to_install() {
   readonly DEFAULT_CUDA_VERSION="12.4"
   CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION})
   readonly CUDA_VERSION
@@ -599,13 +631,6 @@ function prepare_to_install(){
   readonly ROLE=$(get_metadata_attribute dataproc-role)
   readonly MASTER=$(get_metadata_attribute dataproc-master)
 
-  # RAPIDS config
-  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
-  readonly RAPIDS_RUNTIME
-
-  readonly DEFAULT_DASK_RAPIDS_VERSION="24.08"
-  readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION})
-
   # Dask config
   DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')"
   readonly DASK_RUNTIME
@@ -620,9 +645,16 @@ function prepare_to_install(){
   readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
   enable_worker_service="0"
 
+  # RAPIDS config
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
+  readonly RAPIDS_RUNTIME
+
+  readonly DEFAULT_DASK_RAPIDS_VERSION="24.08"
+  readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' ${DEFAULT_DASK_RAPIDS_VERSION})
+
   free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
   # Write to a ramdisk instead of churning the persistent disk
-  if [[ ${free_mem} -ge 5250000 ]]; then
+  if [[ ${free_mem} -ge 10500000 ]]; then
     tmpdir=/mnt/shm
     mkdir -p /mnt/shm
     mount -t tmpfs tmpfs /mnt/shm
@@ -638,6 +670,7 @@ function prepare_to_install(){
     if is_debuntu ; then
       mount -t tmpfs tmpfs /var/cache/apt/archives
     else
+      while [[ -f /var/cache/dnf/metadata_lock.pid ]] ; do sleep 1s ; done
       mount -t tmpfs tmpfs /var/cache/dnf
     fi
   else
@@ -646,16 +679,19 @@ function prepare_to_install(){
   install_log="${tmpdir}/install.log"
   trap exit_handler EXIT
 
+  # Clean conda cache
+  /opt/conda/miniconda3/bin/conda clean -a
+
   # Monitor disk usage in a screen session
   if is_debuntu ; then
     apt-get install -y -qq screen
   else
     dnf -y -q install screen
   fi
-  df -h / | tee "${tmpdir}/disk-usage.log"
-  touch "${tmpdir}/keep-running-df"
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"
   screen -d -m -US keep-running-df \
-    bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done"
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
 }
 
 prepare_to_install
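The exit handler above reduces the periodic "df /" samples in the usage log to a max/min/delta summary. A standalone sketch of that reduction, runnable outside the init action (the log path here is illustrative, not the script's /run/disk-usage.log):

    # Collect two samples, then reduce: keep the used-blocks column of the
    # device lines, sort numerically descending, and report max/min/delta.
    log=/tmp/disk-usage-demo.log
    df / > "${log}" ; df / >> "${log}"
    perl -e '@siz=( sort { $b <=> $a }
                    map  { (split)[2] =~ /^(\d+)/ }
                    grep { m:^/: } <> );
    print "samples: ", scalar @siz,
          " max: $siz[0] min: $siz[-1] delta: ", $siz[0]-$siz[-1], "\n";' "${log}"
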
sleep 5s ; done" } prepare_to_install diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 63fa72a7f..9f8da2f2b 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -12,6 +12,7 @@ class RapidsTestCase(DataprocTestCase): INIT_ACTIONS = [ "gpu/install_gpu_driver.sh", "rapids/rapids.sh" ] + INTERPRETER = '/opt/conda/miniconda3/envs/dask-rapids/bin/python' GPU_A100 = "type=nvidia-tesla-a100,count=2" GPU_H100 = "type=nvidia-h100-80gb,count=2" @@ -20,39 +21,73 @@ class RapidsTestCase(DataprocTestCase): # Tests for RAPIDS init action DASK_RAPIDS_TEST_SCRIPT_FILE_NAME = "verify_rapids_dask.py" + DASK_YARN_TEST_SCRIPT = 'verify_dask_yarn.py' + DASK_STANDALONE_TEST_SCRIPT = 'verify_dask_standalone.py' + + def verify_dask_yarn(self, name): + self._run_dask_test_script(name, self.DASK_YARN_TEST_SCRIPT) + + def verify_dask_standalone(self, name, master_hostname): + script=self.DASK_STANDALONE_TEST_SCRIPT + verify_cmd = "{} {} {}".format( + INTERPRETER, + script, + master_hostname + ) + abspath=os.path.join(os.path.dirname(os.path.abspath(__file__)),script) + self.upload_test_file(abspath, name) + self.assert_instance_command(name, verify_cmd) + self.remove_test_script(script, name) + + def _run_dask_test_script(self, name, script): + verify_cmd = "{} {}".format( + INTERPRETER, + script) + self.upload_test_file( + os.path.join(os.path.dirname(os.path.abspath(__file__)), + script), name) + command_asserted=0 + for try_number in range(0, 3): + try: + self.assert_instance_command(name, verify_cmd) + command_asserted=1 + break + except: + time.sleep(2**try_number) + if command_asserted == 0: + raise Exception("Unable to assert instance command [{}]".format(verify_cmd)) + + self.remove_test_script(script, name) + def verify_dask_worker_service(self, name): + verify_cmd = "[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]" # Retry the first ssh to ensure it has enough time to propagate SSH keys + command_asserted=0 for try_number in range(0, 3): try: - self.assert_instance_command( - name, "[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]") + self.assert_instance_command(name, verify_cmd) + command_asserted=1 break except: time.sleep(2**try_number) + if command_asserted == 0: + raise Exception("Unable to assert instance command [{}]".format(verify_cmd)) def verify_dask_config(self, name): self.assert_instance_command( name, "grep 'class: \"dask_cuda.CUDAWorker\"' /etc/dask/config.yaml") - def run_dask_script(self, name): - test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), - self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) - self.upload_test_file(test_filename, name) - verify_cmd = "/opt/conda/miniconda3/envs/dask-rapids/bin/python {}".format( - self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) - self.assert_instance_command(name, verify_cmd) - self.remove_test_script(self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME, name) - - @parameterized.parameters( -# If a new version of dask-yarn is released, add this test back in. 
-# ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), -# ("STANDARD", ["m"], GPU_T4, None), + ("STANDARD", ["m", "w-0"], GPU_T4, "yarn"), + ("STANDARD", ["m"], GPU_T4, None), ("STANDARD", ["m", "w-0"], GPU_T4, "standalone") ) - def test_rapids_dask(self, configuration, machine_suffixes, accelerator, + def test_dask(self, configuration, machine_suffixes, accelerator, dask_runtime): + if self.getImageVersion() < pkg_resources.parse_version("2.0"): + self.skipTest("Not supported in pre-2.0 images") + metadata = "gpu-driver-provider=NVIDIA,rapids-runtime=DASK" if dask_runtime: metadata += ",dask-runtime={}".format(dask_runtime) @@ -61,21 +96,30 @@ def test_rapids_dask(self, configuration, machine_suffixes, accelerator, configuration, self.INIT_ACTIONS, metadata=metadata, - machine_type="n1-standard-8", + machine_type='n1-standard-8', master_accelerator=accelerator, worker_accelerator=accelerator, - boot_disk_size="50GB", - timeout_in_minutes=60) + timeout_in_minutes=20 + ) + + c_name=self.getClusterName() + if configuration == 'HA': + master_hostname = c_name + '-m-0' + else: + master_hostname = c_name + '-m' for machine_suffix in machine_suffixes: - machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + machine_name="{}-{}".format(c_name,machine_suffix) + if dask_runtime == 'standalone' or dask_runtime == None: self.verify_dask_worker_service(machine_name) + self.verify_dask_standalone(machine_name, master_hostname) elif dask_runtime == 'yarn': self.verify_dask_config(machine_name) + self._run_dask_test_script(name, self.DASK_YARN_TEST_SCRIPT) - self.run_dask_script(machine_name) + self._run_dask_test_script(machine_name, self.DASK_RAPIDS_TEST_SCRIPT_FILE_NAME) -if __name__ == "__main__": +if __name__ == '__main__': absltest.main() From 2897d9e909a8952cbb96636a9098d0d8f35d378d Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Thu, 31 Oct 2024 13:05:02 -0700 Subject: [PATCH 2/3] refactored to reduce code and improve debugging messages --- rapids/test_rapids.py | 47 ++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/rapids/test_rapids.py b/rapids/test_rapids.py index 9f8da2f2b..6df16c84a 100644 --- a/rapids/test_rapids.py +++ b/rapids/test_rapids.py @@ -29,49 +29,32 @@ def verify_dask_yarn(self, name): def verify_dask_standalone(self, name, master_hostname): script=self.DASK_STANDALONE_TEST_SCRIPT - verify_cmd = "{} {} {}".format( - INTERPRETER, - script, - master_hostname - ) - abspath=os.path.join(os.path.dirname(os.path.abspath(__file__)),script) - self.upload_test_file(abspath, name) - self.assert_instance_command(name, verify_cmd) - self.remove_test_script(script, name) + script_and_arg="{} {}".format( script, master_hostname ) + self._run_dask_test_script(name, script_and_arg) def _run_dask_test_script(self, name, script): - verify_cmd = "{} {}".format( - INTERPRETER, - script) - self.upload_test_file( - os.path.join(os.path.dirname(os.path.abspath(__file__)), - script), name) - command_asserted=0 - for try_number in range(0, 3): - try: - self.assert_instance_command(name, verify_cmd) - command_asserted=1 - break - except: - time.sleep(2**try_number) - if command_asserted == 0: - raise Exception("Unable to assert instance command [{}]".format(verify_cmd)) - + verify_cmd = "{} {}".format( INTERPRETER, script ) + abspath=os.path.join(os.path.dirname(os.path.abspath(__file__)),script) + self.upload_test_file(abspath, name) + self.retry_assert_instance_command( name, verify_cmd ) self.remove_test_script(script, name) - def verify_dask_worker_service(self, name): - verify_cmd = "[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]" - # Retry the first ssh to ensure it has enough time to propagate SSH keys + def retry_assert_instance_command(self, name, verify_cmd): command_asserted=0 for try_number in range(0, 3): try: self.assert_instance_command(name, verify_cmd) command_asserted=1 break - except: + except Exception as err: + print('command failed with exception «{}»'.format(err)) time.sleep(2**try_number) if command_asserted == 0: - raise Exception("Unable to assert instance command [{}]".format(verify_cmd)) + raise Exception("Unable to assert instance command «{}»".format(verify_cmd)) + + def verify_dask_worker_service(self, name): + verify_cmd = "[[ X$(systemctl show dask-worker -p SubState --value)X == XrunningX ]]" + self.retry_assert_instance_command( name, verify_cmd ) def verify_dask_config(self, name): self.assert_instance_command( @@ -99,7 +82,7 @@ def test_dask(self, configuration, machine_suffixes, accelerator, machine_type='n1-standard-8', master_accelerator=accelerator, worker_accelerator=accelerator, - timeout_in_minutes=20 + timeout_in_minutes=40 ) c_name=self.getClusterName() From 710588ddff1277bb121532660e22a28045856585 Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Fri, 8 Nov 2024 18:39:36 -0800 Subject: [PATCH 3/3] these have been tested to work for the most part --- gpu/install_gpu_driver.sh | 411 ++++++++++++++++++------ integration_tests/dataproc_test_case.py | 10 +- 2 files changed, 309 insertions(+), 112 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 10b1aa061..7add73b0f 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -16,24 +16,26 @@ set -euxo pipefail -function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function os_version() { grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; } -function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; } -function is_rocky() { [[ "$(os_id)" == 'rocky' ]] ; } -function is_rocky8() { is_rocky && [[ "$(os_version)" == '8'* ]] ; } -function is_rocky9() { is_rocky && [[ "$(os_version)" == '9'* ]] ; } -function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } -function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } -function is_ubuntu20() { is_ubuntu && [[ "$(os_version)" == '20.04'* ]] ; } -function is_ubuntu22() { is_ubuntu && [[ "$(os_version)" == '22.04'* ]] ; } -function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } -function is_debian10() { is_debian && [[ "$(os_version)" == '10'* ]] ; } -function is_debian11() { is_debian && [[ "$(os_version)" == '11'* ]] ; } -function is_debian12() { is_debian && [[ "$(os_version)" == '12'* ]] ; } -function os_vercat() { set +x +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) +function is_rocky() ( set +x ; [[ "$(os_id)" == 'rocky' ]] ; ) +function is_rocky8() ( set +x ; is_rocky && [[ "$(os_version)" == '8'* ]] ; ) +function is_rocky9() ( set +x ; is_rocky && [[ "$(os_version)" == '9'* ]] ; ) +function is_ubuntu() ( set +x ; [[ "$(os_id)" == 'ubuntu' ]] ; ) +function is_ubuntu18() ( set +x ; is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; ) +function is_ubuntu20() ( set +x ; is_ubuntu && [[ "$(os_version)" == '20.04'* ]] ; ) +function is_ubuntu22() ( set +x ; is_ubuntu && [[ "$(os_version)" == '22.04'* ]] ; ) +function is_debian() ( set +x ; [[ "$(os_id)" == 'debian' ]] ; ) +function is_debian10() ( set +x ; is_debian && [[ "$(os_version)" == '10'* ]] ; ) +function is_debian11() ( set +x ; is_debian && [[ "$(os_version)" == '11'* ]] ; ) +function is_debian12() ( set +x ; is_debian && [[ "$(os_version)" == '12'* ]] ; ) +function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) + +function os_vercat() ( set +x if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' - else os_version ; fi ; set -x ; } + else os_version ; fi ; ) function remove_old_backports { if is_debian12 ; then return ; fi @@ -56,11 +58,13 @@ function remove_old_backports { done } +# Return true if the first argument is equal to or less than the second argument function compare_versions_lte { [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; } -function compare_versions_lt() { +# Return true if the first argument is less than the second argument +function compare_versions_lt() ( set +x [ "$1" = "$2" ] && return 1 || compare_versions_lte $1 $2 -} +) function print_metadata_value() { local readonly tmpfile=$(mktemp) @@ -83,7 +87,7 @@ function 
print_metadata_value_if_exists() { return ${return_code} } -function get_metadata_value() { +function get_metadata_value() ( set +x local readonly varname=$1 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 @@ -95,17 +99,16 @@ function get_metadata_value() { print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} return_code=$? fi - set -x + return ${return_code} -} +) -function get_metadata_attribute() { +function get_metadata_attribute() ( set +x local -r attribute_name="$1" local -r default_value="${2:-}" get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" - set -x -} +) OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') distribution=$(. /etc/os-release;echo $ID$VERSION_ID) @@ -117,21 +120,20 @@ readonly ROLE # CUDA version and Driver version # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +# https://developer.nvidia.com/cuda-downloads readonly -A DRIVER_FOR_CUDA=( - [11.8]="525.147.05" [12.1]="530.30.02" [12.4]="550.54.14" - [12.5]="555.42.06" [12.6]="560.28.03" + [11.8]="525.147.05" [12.4]="550.54.14" [12.6]="560.35.03" ) +# https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( - [11.8]="8.6.0.163" [12.1]="8.9.0" [12.4]="9.1.0.70" - [12.5]="9.2.1.18" + [11.8]="9.5.1.17" [12.4]="9.5.1.17" [12.6]="9.5.1.17" ) +# https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( - [11.8]="2.15.5" [12.1]="2.17.1" [12.4]="2.21.5" - [12.5]="2.22.3" + [11.8]="2.15.5" [12.4]="2.23.4" [12.6]="2.23.4" ) readonly -A CUDA_SUBVER=( - [11.8]="11.8.0" [12.1]="12.1.0" [12.4]="12.4.1" - [12.5]="12.5.1" + [11.8]="11.8.0" [12.4]="12.4.1" [12.6]="12.6.2" ) RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') @@ -140,8 +142,8 @@ CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") readonly CUDA_VERSION readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" -function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } -function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } +function is_cuda12() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "12" ]] ; ) +function is_cuda11() ( set +x ; [[ "${CUDA_VERSION%%.*}" == "11" ]] ; ) readonly DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") if is_debian11 || is_ubuntu22 || is_ubuntu20 ; then DRIVER_VERSION="560.28.03" ; fi @@ -153,8 +155,8 @@ readonly DRIVER=${DRIVER_VERSION%%.*} # Parameters for NVIDIA-provided CUDNN library readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") -function is_cudnn8() { [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; } -function is_cudnn9() { [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; } +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) if is_rocky \ && (compare_versions_lte "${CUDNN_VERSION}" "8.0.5.39") ; then CUDNN_VERSION="8.0.5.39" @@ -213,6 +215,7 @@ readonly -A DEFAULT_NVIDIA_CUDA_URLS=( [11.8]="${NVIDIA_BASE_DL_URL}/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" [12.1]="${NVIDIA_BASE_DL_URL}/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run" [12.4]="${NVIDIA_BASE_DL_URL}/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run" + [12.6]="${NVIDIA_BASE_DL_URL}/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run" ) readonly 
DEFAULT_NVIDIA_CUDA_URL=${DEFAULT_NVIDIA_CUDA_URLS["${CUDA_VERSION}"]}
 NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
@@ -230,9 +233,10 @@ if ( compare_versions_lte "8.3.1.22" "${CUDNN_VERSION}" ); then
   fi
   CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
 fi
-if ( compare_versions_lte "12.0" "${CUDA_VERSION}" ); then
-  # When cuda version is greater than 12.0
-  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.2.0.82_cuda12-archive.tar.xz"
+if is_cuda12 ; then
+  # When cuda version is 12
+  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz"
+  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
 fi
 readonly CUDNN_TARBALL
 readonly CUDNN_TARBALL_URL
@@ -256,16 +260,23 @@ NVIDIA_SMI_PATH='/usr/bin'
 MIG_MAJOR_CAPS=0
 IS_MIG_ENABLED=0
-function execute_with_retries() {
+function execute_with_retries() ( set +x
   local -r cmd="$*"
+
+  if [[ "$cmd" =~ ^apt-get\ install ]] ; then
+    apt-get -y clean
+    apt-get -y autoremove
+  fi
   for ((i = 0; i < 3; i++)); do
-    if eval "$cmd"; then set -x ; return 0 ; fi
+    set -x
+    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
+    set +x
+    if [[ $retval == 0 ]] ; then return 0 ; fi
     sleep 5
   done
-  set -x
   return 1
-}
+)
 
 CUDA_KEYRING_PKG_INSTALLED="0"
 function install_cuda_keyring_pkg() {
@@ -273,9 +284,9 @@
   local kr_ver=1.1
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \
-    -o /tmp/cuda-keyring.deb
-  dpkg -i "/tmp/cuda-keyring.deb"
-  rm -f "/tmp/cuda-keyring.deb"
+    -o "${tmpdir}/cuda-keyring.deb"
+  dpkg -i "${tmpdir}/cuda-keyring.deb"
+  rm -f "${tmpdir}/cuda-keyring.deb"
   CUDA_KEYRING_PKG_INSTALLED="1"
 }
 
@@ -295,10 +306,10 @@ function install_local_cuda_repo() {
   readonly DIST_KEYRING_DIR="/var/${pkgname}"
 
   curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${LOCAL_DEB_URL}" -o "/tmp/${LOCAL_INSTALLER_DEB}"
+    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"
 
-  dpkg -i "/tmp/${LOCAL_INSTALLER_DEB}"
-  rm "/tmp/${LOCAL_INSTALLER_DEB}"
+  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
+  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
   cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/
 
   if is_ubuntu ; then
@@ -323,11 +334,11 @@ function install_local_cudnn_repo() {
   # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz
   curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
-    "${local_deb_url}" -o /tmp/local-installer.deb
+    "${local_deb_url}" -o "${tmpdir}/local-installer.deb"
 
-  dpkg -i /tmp/local-installer.deb
+  dpkg -i "${tmpdir}/local-installer.deb"
 
-  rm -f /tmp/local-installer.deb
+  rm -f "${tmpdir}/local-installer.deb"
 
   cp /var/cudnn-local-repo-*-${CUDNN}*/cudnn-local-*-keyring.gpg /usr/share/keyrings
 
@@ -354,8 +365,9 @@ function install_local_cudnn8_repo() {
   pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
   CUDNN8_PKG_NAME="${pkgname}"
 
-  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
-  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${local_deb_fn}"
+  deb_fn="${pkgname}_1.0-1_amd64.deb"
+  local_deb_fn="${tmpdir}/${deb_fn}"
+  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"
 
   curl -fsSL --retry-connrefused --retry 3
--retry-max-time 5 \ "${local_deb_url}" -o "${local_deb_fn}" @@ -376,22 +388,25 @@ function install_nvidia_nccl() { local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" if is_rocky ; then - time execute_with_retries \ + execute_with_retries \ dnf -y -q install \ "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" + sync elif is_ubuntu ; then install_cuda_keyring_pkg apt-get update -qq if is_ubuntu18 ; then - time execute_with_retries \ + execute_with_retries \ apt-get install -q -y \ libnccl2 libnccl-dev + sync else - time execute_with_retries \ + execute_with_retries \ apt-get install -q -y \ "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" + sync fi else echo "Unsupported OS: '${OS_NAME}'" @@ -403,8 +418,8 @@ function install_nvidia_nccl() { fi } -function is_src_nvidia() { [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; } -function is_src_os() { [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; } +function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) +function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { local major_version @@ -414,17 +429,19 @@ function install_nvidia_cudnn() { if is_rocky ; then if is_cudnn8 ; then - execute_with_retries "dnf -y -q install" \ + execute_with_retries dnf -y -q install \ "libcudnn${major_version}" \ "libcudnn${major_version}-devel" + sync elif is_cudnn9 ; then - execute_with_retries "dnf -y -q install" \ + execute_with_retries dnf -y -q install \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" + sync else echo "Unsupported cudnn version: '${major_version}'" fi - elif is_debian || is_ubuntu; then + elif is_debuntu; then if is_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else @@ -438,6 +455,7 @@ function install_nvidia_cudnn() { apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" + sync elif is_cudnn9 ; then install_cuda_keyring_pkg @@ -448,6 +466,7 @@ function install_nvidia_cudnn() { "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" + sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi @@ -458,7 +477,8 @@ function install_nvidia_cudnn() { "libcudnn${major_version}=${cudnn_pkg_version}" "libcudnn${major_version}-dev=${cudnn_pkg_version}") execute_with_retries \ - "apt-get install -q -y --no-install-recommends ${packages[*]}" + apt-get install -q -y --no-install-recommends "${packages[*]}" + sync else echo "Unsupported OS: '${OS_NAME}'" exit 1 @@ -579,7 +599,7 @@ function add_nonfree_components() { } function add_repo_nvidia_container_toolkit() { - if is_debian || is_ubuntu ; then + if is_debuntu ; then local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html @@ -595,7 +615,7 @@ function add_repo_nvidia_container_toolkit() { } function add_repo_cuda() { - if is_debian || is_ubuntu ; then + if is_debuntu ; then local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ @@ -624,8 +644,7 @@ function build_driver_from_github() { 
tarball_fn="${DRIVER_VERSION}.tar.gz" curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ - -o "${tarball_fn}" - tar xzf "${tarball_fn}" + | tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } cd open-gpu-kernel-modules @@ -633,6 +652,7 @@ function build_driver_from_github() { time make -j$(nproc) modules \ > /var/log/open-gpu-kernel-modules-build.log \ 2> /var/log/open-gpu-kernel-modules-build_error.log + sync if [[ -n "${PSN}" ]]; then #configure_dkms_certs @@ -669,38 +689,41 @@ function build_driver_from_packages() { fi add_contrib_component apt-get update -qq - execute_with_retries "apt-get install -y -qq --no-install-recommends dkms" + execute_with_retries apt-get install -y -qq --no-install-recommends dkms #configure_dkms_certs - time execute_with_retries "apt-get install -y -qq --no-install-recommends ${pkglist[@]}" + execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" + sync elif is_rocky ; then #configure_dkms_certs if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else - time execute_with_retries dnf -y -q module install 'nvidia-driver:latest' + execute_with_retries dnf -y -q module install 'nvidia-driver:latest' fi + sync fi #clear_dkms_key } function install_nvidia_userspace_runfile() { - if test -d /run/nvidia-userspace ; then return ; fi + if test -f "${tmpdir}/userspace-complete" ; then return ; fi curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${USERSPACE_URL}" -o userspace.run - time bash "./userspace.run" --no-kernel-modules --silent --install-libglvnd \ - > /dev/null 2>&1 - rm -f userspace.run - mkdir -p /run/nvidia-userspace + "${USERSPACE_URL}" -o "${tmpdir}/userspace.run" + execute_with_retries bash "${tmpdir}/userspace.run" --no-kernel-modules --silent --install-libglvnd --tmpdir="${tmpdir}" + rm -f "${tmpdir}/userspace.run" + touch "${tmpdir}/userspace-complete" + sync } function install_cuda_runfile() { - if test -d /run/nvidia-cuda ; then return ; fi - curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ - "${NVIDIA_CUDA_URL}" -o cuda.run - time bash "./cuda.run" --silent --toolkit --no-opengl-libs - rm -f cuda.run - mkdir -p /run/nvidia-cuda + if test -f "${tmpdir}/cuda-complete" ; then return ; fi + time curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${NVIDIA_CUDA_URL}" -o "${tmpdir}/cuda.run" + execute_with_retries bash "${tmpdir}/cuda.run" --silent --toolkit --no-opengl-libs --tmpdir="${tmpdir}" + rm -f "${tmpdir}/cuda.run" + touch "${tmpdir}/cuda-complete" + sync } function install_cuda_toolkit() { @@ -712,11 +735,13 @@ function install_cuda_toolkit() { fi cuda_package="cuda=${CUDA_FULL_VERSION}-1" readonly cudatk_package - if is_ubuntu || is_debian ; then + if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi - time execute_with_retries "apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}" + execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} + sync elif is_rocky ; then - time execute_with_retries "dnf -y -q install ${cudatk_package}" + execute_with_retries dnf -y -q install "${cudatk_package}" + sync fi } @@ -774,7 +799,7 @@ function install_nvidia_gpu_driver() { 
load_kernel_module install_cuda_runfile - elif is_debian || is_ubuntu ; then + elif is_debuntu ; then install_cuda_keyring_pkg build_driver_from_packages @@ -796,7 +821,11 @@ function install_nvidia_gpu_driver() { exit 1 fi ldconfig - echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + if is_src_os ; then + echo "NVIDIA GPU driver provided by ${OS_NAME} was installed successfully" + else + echo "NVIDIA GPU driver provided by NVIDIA was installed successfully" + fi } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics @@ -812,7 +841,8 @@ function install_gpu_agent() { "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" - pip install -r "${install_dir}/requirements.txt" + execute_with_retries pip install -r "${install_dir}/requirements.txt" + sync # Generate GPU service. cat </lib/systemd/system/gpu-utilization-agent.service @@ -837,7 +867,6 @@ EOF systemctl --no-reload --now enable gpu-utilization-agent.service } -readonly bdcfg="/usr/local/bin/bdconfig" function set_hadoop_property() { local -r config_file=$1 local -r property=$2 @@ -991,7 +1020,6 @@ EOF systemctl start dataproc-cgroup-device-permissions } -nvsmi_works="0" function nvsmi() { local nvsmi="/usr/bin/nvidia-smi" if [[ "${nvsmi_works}" == "1" ]] ; then echo "nvidia-smi is working" >&2 @@ -1018,12 +1046,14 @@ function main() { remove_old_backports - if is_debian || is_ubuntu ; then + if is_debuntu ; then export DEBIAN_FRONTEND=noninteractive - execute_with_retries "apt-get install -y -qq pciutils linux-headers-${uname_r}" + execute_with_retries apt-get install -y -qq pciutils "linux-headers-${uname_r}" > /dev/null 2>&1 elif is_rocky ; then - execute_with_retries "dnf -y -q update --exclude=systemd*,kernel*" - execute_with_retries "dnf -y -q install pciutils gcc" + execute_with_retries dnf -y -q update --exclude=systemd*,kernel* \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } + execute_with_retries dnf -y -q install pciutils gcc \ + > "${install_log}" 2>&1 || { cat "${install_log}" && exit -4 ; } local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" local kernel_devel_pkg_out="$(eval "${dnf_cmd} 2>&1")" @@ -1035,7 +1065,9 @@ function main() { "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" \ + > "${install_log}" 2>&1 || { cat "${install_log}" ; exit -4 ; } + sync else execute_with_retries "${dnf_cmd}" fi @@ -1197,8 +1229,10 @@ function clean_up_sources_lists() { # cran-r # if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then + keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" + if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi rm -f /usr/share/keyrings/cran-r.gpg - curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7' | \ + curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \ gpg --dearmor -o /usr/share/keyrings/cran-r.gpg sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list fi @@ -1213,21 +1247,184 @@ function clean_up_sources_lists() { sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' 
/etc/apt/sources.list.d/mysql.list fi - if -f /etc/apt/trusted.gpg ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi + if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi } -if is_debian ; then - clean_up_sources_lists - apt-get update - if is_debian12 ; then - apt-mark unhold systemd libsystemd0 ; fi -fi +function exit_handler() { + set +ex + echo "Exit handler invoked" -configure_dkms_certs + # Purge private key material until next grant + clear_dkms_key -main + # Free conda cache + /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 + + # Clear pip cache + pip cache purge || echo "unable to purge pip cache" + + # If system memory was sufficient to mount memory-backed filesystems + if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + # Stop hadoop services + systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + + # remove the tmpfs conda pkgs_dirs + /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm || echo "unable to remove pkgs_dirs conda config" + + # remove the tmpfs pip cache-dir + pip config unset global.cache-dir || echo "unable to unset global pip cache" + + # Clean up shared memory mounts + for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do + if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then + rm -rf ${shmdir}/* + sync + sleep 3s + execute_with_retries umount -f ${shmdir} + fi + done + + umount -f /tmp + systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' + fi -clear_dkms_key + # Clean up OS package cache ; re-hold systemd package + if is_debuntu ; then + apt-get -y -qq clean + apt-get -y -qq autoremove + if is_debian12 ; then + apt-mark hold systemd libsystemd0 ; fi + else + dnf clean all + fi + + # print disk usage statistics for large components + if is_ubuntu ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + elif is_debian ; then + du -hs \ + /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ + /usr/lib \ + /usr/local/cuda-1?.? \ + /opt/conda/miniconda3 + else + du -hs \ + /var/lib/docker \ + /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ + /usr/lib64/google-cloud-sdk \ + /usr/lib \ + /opt/nvidia/* \ + /usr/local/cuda-1?.? 
+      /opt/conda/miniconda3
+  fi
+
+  # Process disk usage logs from installation period
+  rm -f /run/keep-running-df
+  sync
+  sleep 5.01s
+  # compute maximum size of disk during installation
+  # Log file contains logs like the following (minus the preceding #):
+#Filesystem     1K-blocks    Used Available Use% Mounted on
+#/dev/vda2        7096908 2611344   4182932  39% /
+  df / | tee -a "/run/disk-usage.log"
+
+  perl -e '@siz=( sort { $b <=> $a }
+                  map { (split)[2] =~ /^(\d+)/ }
+                  grep { m:^/: } );
+$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
+print( " samples-taken: ", scalar @siz, $/,
+       "maximum-disk-used: $max", $/,
+       "minimum-disk-used: $min", $/,
+       " increased-by: $inc", $/ )' < "/run/disk-usage.log"
+
+  echo "exit_handler has completed"
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
+    dd if=/dev/zero of=/zero
+    sync
+    sleep 3s
+    rm -f /zero
+  fi
+
+  return 0
+}
+
+function prepare_to_install(){
+  nvsmi_works="0"
+  readonly bdcfg="/usr/local/bin/bdconfig"
+  tmpdir=/tmp/
+  local free_mem
+  trap exit_handler EXIT
+  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
+  # Write to a ramdisk instead of churning the persistent disk
+  if [[ ${free_mem} -ge 10500000 ]]; then
+    # Services might use /tmp for temporary files
+    echo "debug: this may break things!"
+    systemctl list-units | perl -n -e 'qx(systemctl stop $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+    sudo mount -t tmpfs tmpfs /tmp
+    systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
+
+    tmpdir="/mnt/shm"
+    mkdir -p "${tmpdir}"
+    mount -t tmpfs tmpfs "${tmpdir}"
+
+    # Download conda packages to tmpfs
+    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"
+
+    # Download pip packages to tmpfs
+    pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"
+
+    # Download OS packages to tmpfs
+    if is_debuntu ; then
+      mount -t tmpfs tmpfs /var/cache/apt/archives
+    else
+      mount -t tmpfs tmpfs /var/cache/dnf
+    fi
+  else
+    tmpdir=/tmp
+  fi
+  install_log="${tmpdir}/install.log"
+
+  if is_debuntu ; then
+    clean_up_sources_lists
+    apt-get update -qq
+    apt-get -y clean
+    apt-get -y -qq autoremove
+    if is_debian12 ; then
+      apt-mark unhold systemd libsystemd0 ; fi
+  else
+    dnf clean all
+  fi
+
+  # Clean conda cache
+  /opt/conda/miniconda3/bin/conda clean -a
+
+  # zero free disk space
+  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+    time dd if=/dev/zero of=/zero status=progress ; sync ; sleep 3s ; rm -f /zero
+  ) fi
+
+  configure_dkms_certs
+
+  # Monitor disk usage in a screen session
+  if is_debuntu ; then
+    execute_with_retries apt-get install -y -qq screen
+  else
+    execute_with_retries dnf -y -q install screen
+  fi
+  df / > "/run/disk-usage.log"
+  touch "/run/keep-running-df"
+  screen -d -m -US keep-running-df \
+    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"
+}
+
+prepare_to_install
+
+main
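prepare_to_install above stages package downloads on a tmpfs ramdisk only when the node has plenty of free memory, so the persistent disk is not churned during install. A condensed sketch of that gate (threshold in kB, as in the script):

    # Stage caches in RAM when roughly 10.5 GB or more is free, else fall
    # back to the persistent disk under /tmp.
    free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
    if [[ ${free_mem} -ge 10500000 ]] ; then
      tmpdir=/mnt/shm
      mkdir -p "${tmpdir}"
      mount -t tmpfs tmpfs "${tmpdir}"
    else
      tmpdir=/tmp
    fi
    echo "staging installer downloads under ${tmpdir}"
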
diff --git a/integration_tests/dataproc_test_case.py b/integration_tests/dataproc_test_case.py
index 936718498..0e783a87b 100644
--- a/integration_tests/dataproc_test_case.py
+++ b/integration_tests/dataproc_test_case.py
@@ -21,7 +21,7 @@
 flags.DEFINE_boolean('skip_cleanup', False, 'Skip cleanup of test resources')
 FLAGS(sys.argv)
 
-INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "false").lower() == "true"
+INTERNAL_IP_SSH = os.getenv("INTERNAL_IP_SSH", "true").lower() == "true"
 
 DEFAULT_TIMEOUT = 15  # minutes
 
@@ -114,12 +114,12 @@ def createCluster(self,
         ]
 
         args = self.DEFAULT_ARGS[configuration].copy()
+        args.append("--no-address")
+
         if FLAGS.image:
             args.append("--image={}".format(FLAGS.image))
         elif FLAGS.image_version:
             args.append("--image-version={}".format(FLAGS.image_version))
-        if FLAGS.image_version in self.IMAGE_VERSION_2_2:
-            args.append("--public-ip-address")
 
         for i in init_actions:
             if "install_gpu_driver.sh" in i or "horovod.sh" in i or \
@@ -356,10 +356,10 @@ def random_str(size=4, chars=string.ascii_lowercase + string.digits):
     @staticmethod
     def run_command(cmd, timeout_in_minutes=DEFAULT_TIMEOUT):
         cmd = cmd.replace(
-            "gcloud compute ssh ", "gcloud compute ssh --internal-ip ") if (
+            "gcloud compute ssh ", "gcloud compute ssh --tunnel-through-iap ") if (
                 INTERNAL_IP_SSH and "gcloud compute ssh " in cmd) else cmd
         cmd = cmd.replace("gcloud compute scp ",
-                          "gcloud beta compute scp --internal-ip ") if (
+                          "gcloud beta compute scp --tunnel-through-iap ") if (
                               INTERNAL_IP_SSH and "gcloud compute scp " in cmd) else cmd
         p = subprocess.Popen(
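Because the clusters are now created with --no-address, the harness reaches nodes through IAP tunneling rather than --internal-ip. A hypothetical manual invocation of the same path (cluster name and zone are placeholders; the caller needs the IAP-secured Tunnel User role on the test project):

    # Tunnel an SSH command to a master node that has no external IP.
    gcloud compute ssh --tunnel-through-iap --zone=us-central1-a \
      my-test-cluster-m \
      --command='systemctl show dask-worker -p SubState --value'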