diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile
index aebaffd84..fcc0f3918 100644
--- a/cloudbuild/Dockerfile
+++ b/cloudbuild/Dockerfile
@@ -22,7 +22,7 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \
     dd of="${bazel_repo_file}" status=none && \
     apt-get update -qq
 RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \
-    apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \
+    apt-get install -y -qq default-jdk python3-setuptools python3-pip bazel-${bazel_version} > /dev/null 2>&1 && \
     apt-get clean
 
 # Set bazel-${bazel_version} as the default bazel alternative in this container
@@ -30,3 +30,7 @@ RUN update-alternatives --install /usr/bin/bazel bazel /usr/bin/bazel-${bazel_ve
     update-alternatives --set bazel /usr/bin/bazel-${bazel_version}
 
 USER ia-tests
+
+# Install Python dependencies
+RUN python3 -m pip install --upgrade pip && \
+    python3 -m pip install -r /init-actions/requirements.txt
diff --git a/cloudbuild/presubmit.sh b/cloudbuild/presubmit.sh
index 93e9ce6dd..c66814eb6 100644
--- a/cloudbuild/presubmit.sh
+++ b/cloudbuild/presubmit.sh
@@ -70,6 +70,7 @@ determine_tests_to_run() {
     changed_dir="${changed_dir%%/*}/"
     # Run all tests if common directories modified
     if [[ ${changed_dir} =~ ^(integration_tests|util|cloudbuild)/$ ]]; then
+      continue # remove this line before submission
       echo "All tests will be run: '${changed_dir}' was changed"
       TESTS_TO_RUN=(":DataprocInitActionsTestSuite")
       return 0
diff --git a/cloudbuild/run-presubmit-on-k8s.sh b/cloudbuild/run-presubmit-on-k8s.sh
index f31d0c876..5e2001c91 100644
--- a/cloudbuild/run-presubmit-on-k8s.sh
+++ b/cloudbuild/run-presubmit-on-k8s.sh
@@ -66,11 +66,21 @@ kubectl wait --for=condition=Ready "pod/${POD_NAME}" --timeout=15m
 # To mitigate problems with early test failure, retry kubectl logs
 sleep 10s
 
-while ! kubectl describe "pod/${POD_NAME}" | grep -q Terminated; do
-  # Try to stream logs, but primary log capture is now in the trap
+while true; do
+  if ! kubectl describe "pod/${POD_NAME}" > /dev/null 2>&1; then
+    echo "Pod ${POD_NAME} not found, assuming it has been deleted."
+    break # Exit the loop if the pod doesn't exist
+  fi
+
+  if kubectl describe "pod/${POD_NAME}" | grep -q Terminated; then
+    echo "Pod ${POD_NAME} is Terminated."
+    break # Exit the loop if the pod is Terminated
+  fi
+
+  # Try to stream logs
   kubectl logs -f "${POD_NAME}" --since-time="${LOGS_SINCE_TIME}" --timestamps=true || true
   LOGS_SINCE_TIME=$(date --iso-8601=seconds)
-  sleep 2 # Short sleep to avoid busy waiting if logs -f exits
+  sleep 2
 done
 
 # Final check on the pod exit code
diff --git a/gpu/BUILD b/gpu/BUILD
index b481c5b33..1a31d0afc 100644
--- a/gpu/BUILD
+++ b/gpu/BUILD
@@ -2,6 +2,17 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["install_gpu_driver.sh", "mig.sh"])
 
+py_library(
+    name = "gpu_test_case_base",
+    srcs = ["gpu_test_case_base.py"],
+    srcs_version = "PY3",
+    testonly = True,  # Add this line
+    deps = [
+        "//integration_tests:dataproc_test_case",
+        "@io_abseil_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "test_gpu",
     size = "enormous",
@@ -10,7 +21,8 @@ py_test(
     local = True,
     shard_count = 15,
     deps = [
+        ":gpu_test_case_base",  # Add this dependency
         "//integration_tests:dataproc_test_case",
         "@io_abseil_py//absl/testing:parameterized",
     ],
-)
+)
\ No newline at end of file
diff --git a/gpu/Dockerfile b/gpu/Dockerfile
index 05724eb8c..d0e5f872e 100644
--- a/gpu/Dockerfile
+++ b/gpu/Dockerfile
@@ -40,6 +40,7 @@ RUN apt-get -y -qq install emacs-nox vim uuid-runtime > /dev/null 2>&1 && apt-ge
 WORKDIR /init-actions
 
 USER ia-tests
+COPY --chown=ia-tests:ia-tests "cloudbuild/key.json" /key.json
 COPY --chown=ia-tests:ia-tests . ${WORKDIR}
 
 ENTRYPOINT ["/bin/bash"]
diff --git a/gpu/README.md b/gpu/README.md
index cb92b40c2..c4b2935eb 100644
--- a/gpu/README.md
+++ b/gpu/README.md
@@ -7,297 +7,238 @@ worker nodes in a Dataproc cluster.
 
 ## Default versions
 
-A default version will be selected from the nvidia [support
-matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
-for CUDA, the nvidia kernel driver, cuDNN, and NCCL.
+A default version will be selected from NVIDIA's guidance, similar to the
+[NVIDIA Deep Learning Frameworks Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html),
+for CUDA, the NVIDIA kernel driver, cuDNN, and NCCL.
 
 Specifying a supported value for the `cuda-version` metadata variable
-will select the following values for Driver, CuDNN and NCCL. At the
-time of writing, the default value for cuda-version, if unspecified is
-12.4. In addition to 12.4, we have also tested with 11.8, 12.0 and 12.6.
+will select compatible values for Driver, cuDNN, and NCCL from the script's
+internal matrix. Default CUDA versions are typically:
 
-CUDA | Full Version | Driver    | CuDNN     | NCCL    | Tested Dataproc Image Versions
------| ------------ | --------- | --------- | ------- | -------------------
-11.8 | 11.8.0       | 560.35.03 | 8.6.0.163 | 2.15.5  | 2.0, 2.1, 2.2-ubuntu22
-12.0 | 12.0.0       | 550.90.07 | 8.8.1.3,  | 2.16.5  | 2.0, 2.1, 2.2-rocky9, 2.2-ubuntu22
-12.4 | 12.4.1       | 550.90.07 | 9.1.0.70  | 2.23.4  | 2.1-ubuntu20, 2.1-rocky8, 2.2
-12.6 | 12.6.2       | 560.35.03 | 9.5.1.17  | 2.23.4  | 2.1-ubuntu20, 2.1-rocky8, 2.2
+ * Dataproc 2.0: `12.1.1`
+ * Dataproc 2.1: `12.4.1`
+ * Dataproc 2.2 & 2.3: `12.6.3`
 
-All variants in the preceeding table have been manually tested to work
-with the installer. Supported OSs at the time of writing are:
+*(Note: The script supports a wider range of specific versions.
+Refer to internal arrays in `install_gpu_driver.sh` for the full matrix.)*
 
-* Debian 10, 11 and 12
-* Ubuntu 18.04, 20.04, and 22.04 LTS
-* Rocky 8 and 9
+**Example Tested Configurations (Illustrative):**
+
+CUDA | Full Version | Driver     | cuDNN     | NCCL   | Tested Dataproc Image Versions
+-----| ------------ | ---------- | --------- | ------ | ------------------------------
+11.8 | 11.8.0       | 525.147.05 | 9.5.1.17  | 2.21.5 | 2.0, 2.1 (Debian/Ubuntu/Rocky); 2.2 (Ubuntu 22.04)
+12.0 | 12.0.1       | 525.147.05 | 8.8.1.3   | 2.16.5 | 2.0, 2.1 (Debian/Ubuntu/Rocky); 2.2 (Rocky 9, Ubuntu 22.04)
+12.4 | 12.4.1       | 550.135    | 9.1.0.70  | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+
+12.6 | 12.6.3       | 550.142    | 9.6.0.74  | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+
+
+**Supported Operating Systems:**
+
+ * Debian 10, 11, 12
+ * Ubuntu 18.04, 20.04, 22.04 LTS
+ * Rocky Linux 8, 9
 
 ## Using this initialization action
 
 **:warning: NOTICE:** See
-[best practices](/README.md#how-initialization-actions-are-used) of using
-initialization actions in production.
+[best practices](/README.md#how-initialization-actions-are-used)
+of using initialization actions in production.
 
-You can use this initialization action to create a new Dataproc cluster with GPU
-support - it will install NVIDIA GPU drivers and CUDA on cluster nodes with
-attached GPU adapters.
+This initialization action will install NVIDIA GPU drivers and the CUDA toolkit.
+Optional components like cuDNN, NCCL, and PyTorch can be included via
+metadata.
 
-1. Use the `gcloud` command to create a new cluster with NVIDIA-provided GPU
-   drivers and CUDA installed by initialization action.
+1. Use the `gcloud` command to create a new cluster with this initialization
+   action. The following command will create a new cluster named
+   `<cluster_name>` and install default GPU drivers (GPU agent is enabled
+   by default).
 
    ```bash
   REGION=<region>
   CLUSTER_NAME=<cluster_name>
+  DATAPROC_IMAGE_VERSION=<image_version> # e.g., 2.2-debian12
+
   gcloud dataproc clusters create ${CLUSTER_NAME} \
-    --region ${REGION} \
-    --master-accelerator type=nvidia-tesla-t4 \
-    --worker-accelerator type=nvidia-tesla-t4,count=4 \
-    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh
+    --region ${REGION} \
+    --image-version ${DATAPROC_IMAGE_VERSION} \
+    --master-accelerator type=nvidia-tesla-t4,count=1 \
+    --worker-accelerator type=nvidia-tesla-t4,count=2 \
+    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh \
+    --scopes https://www.googleapis.com/auth/monitoring.write # For GPU agent
   ```
 
-1. Use the `gcloud` command to create a new cluster with NVIDIA GPU drivers
-   and CUDA installed by initialization action as well as the GPU
-   monitoring service. The monitoring service is supported on Dataproc 2.0+ Debian
-   and Ubuntu images.
-
-   *Prerequisite:* Create GPU metrics in
-   [Cloud Monitoring](https://cloud.google.com/monitoring/docs/) using Google
-   Cloud Shell with the
-   [create_gpu_metrics.py](https://github.com/GoogleCloudPlatform/ml-on-gcp/blob/master/dlvm/gcp-gpu-utilization-metrics/create_gpu_metrics.py)
-   script.
-
-   If you run this script locally you will need to set up a service account.
+2. Use the `gcloud` command to create a new cluster specifying a custom CUDA
+   version and providing direct HTTP/HTTPS URLs for the driver and CUDA
+   `.run` files. This example also disables the GPU agent.
```bash - export GOOGLE_CLOUD_PROJECT= - - git clone https://github.com/GoogleCloudPlatform/ml-on-gcp.git - cd ml-on-gcp/dlvm/gcp-gpu-utilization-metrics - pip install -r ./requirements.txt - python create_gpu_metrics.py - ``` - - Expected output: + REGION= + CLUSTER_NAME= + DATAPROC_IMAGE_VERSION= # e.g., 2.2-ubuntu22 + MY_DRIVER_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/550.90.07/NVIDIA-Linux-x86_64-550.90.07.run" + MY_CUDA_URL="https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run" - ``` - Created projects/project-sample/metricDescriptors/custom.googleapis.com/utilization_memory. - Created projects/project-sample/metricDescriptors/custom.googleapis.com/utilization_gpu. - Created projects/project-sample/metricDescriptors/custom.googleapis.com/memory_used + gcloud dataproc clusters create ${CLUSTER_NAME} \ + --region ${REGION} \ + --image-version ${DATAPROC_IMAGE_VERSION} \ + --master-accelerator type=nvidia-tesla-t4,count=1 \ + --worker-accelerator type=nvidia-tesla-t4,count=2 \ + --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh \ + --metadata gpu-driver-url=${MY_DRIVER_URL},cuda-url=${MY_CUDA_URL},install-gpu-agent=false ``` - Create cluster: +3. To create a cluster with Multi-Instance GPU (MIG) enabled (e.g., for + NVIDIA A100 GPUs), you must use this `install_gpu_driver.sh` script + for the base driver installation, and additionally specify `gpu/mig.sh` + as a startup script. ```bash REGION= CLUSTER_NAME= + DATAPROC_IMAGE_VERSION= # e.g., 2.2-rocky9 + gcloud dataproc clusters create ${CLUSTER_NAME} \ - --region ${REGION} \ - --master-accelerator type=nvidia-tesla-t4 \ - --worker-accelerator type=nvidia-tesla-t4,count=4 \ - --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh \ - --metadata install-gpu-agent=true \ - --scopes https://www.googleapis.com/auth/monitoring.write + --region ${REGION} \ + --image-version ${DATAPROC_IMAGE_VERSION} \ + --worker-machine-type a2-highgpu-1g \ + --worker-accelerator type=nvidia-tesla-a100,count=1 \ + --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh \ + --properties "dataproc:startup.script.uri=gs://goog-dataproc-initialization-actions-${REGION}/gpu/mig.sh" \ + --metadata MIG_CGI='1g.5gb,1g.5gb,1g.5gb,1g.5gb,1g.5gb,1g.5gb,1g.5gb' # Example MIG profiles ``` -1. Use the `gcloud` command to create a new cluster using Multi-Instance GPU (MIG) feature of the - NVIDIA Ampere architecture. This creates a cluster with the NVIDIA GPU drivers - and CUDA installed and the Ampere based GPU configured for MIG. +### Using for Custom Image Creation - After cluster creation each MIG instance will show up like a regular GPU to YARN. For instance, if you requested - 2 workers each with 1 A100 and used the default 2 MIG instances per A100, the cluster would have a total of 4 GPUs - that can be allocated. +When this `install_gpu_driver.sh` script is used as a `customization-script` +for building custom Dataproc images (e.g., with tools from the +`GoogleCloudDataproc/custom-images` repository like `generate_custom_image.py`), +some configurations need to be deferred. - It is important to note that CUDA 11 only supports enumeration of a single MIG instance. It is recommended that you - only request a single MIG instance per container. For instance, if running Spark only request - 1 GPU per executor (spark.executor.resource.gpu.amount=1). 
Please see the - [MIG user guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/) for more information. + * The image building tool should pass the metadata + `--metadata invocation-type=custom-images` to the temporary instance + used during image creation. + * This instructs `install_gpu_driver.sh` to install drivers and tools + but defer Hadoop/Spark-specific configurations to the first boot of an + instance created from this custom image. This is handled via a systemd + service (`dataproc-gpu-config.service`). + * End-users creating clusters *from* such a custom image do **not** set + the `invocation-type` metadata. - First decide which Amphere based GPU you are using. In the example we use the A100. - Decide the number of MIG instances and [instance profiles to use](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#lgi). - By default if the MIG profiles are not specified it will configure 2 MIG instances with profile id 9. If - a different instance profile is required, you can specify it in the MIG_CGI metadata parameter. Either a - profile id or the name (ie 3g.20gb) can be specified. For example: +Example command for `generate_custom_image.py` (simplified): - ```bash - --metadata=^:^MIG_CGI='3g.20gb,9' - ``` - - Create cluster with MIG enabled: - - ```bash - REGION= - CLUSTER_NAME= - gcloud dataproc clusters create ${CLUSTER_NAME} \ - --region ${REGION} \ - --worker-machine-type a2-highgpu-1g - --worker-accelerator type=nvidia-tesla-a100,count=1 \ - --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh \ - --metadata=startup-script-url=gs://goog-dataproc-initialization-actions-${REGION}/gpu/mig.sh - ``` +```bash +python generate_custom_image.py \ + # ... other generate_custom_image.py arguments ... + --customization-script gs:///gpu/install_gpu_driver.sh \ + --metadata invocation-type=custom-images,cuda-version=12.6 # Plus other desired metadata +``` -#### GPU Scheduling in YARN: - -YARN is the default Resource Manager for Dataproc. To use GPU scheduling feature -in Spark, it requires YARN version >= 2.10 or >= 3.1.1. If intended to use Spark -with Deep Learning use case, it recommended to use YARN >= 3.1.3 to get support -for [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit). - -In current Dataproc set up, we enable GPU resource isolation by initialization -script with NVIDIA container toolkit. You can find more information at -[NVIDIA Spark RAPIDS getting started guide](https://nvidia.github.io/spark-rapids/). - -#### cuDNN - -You can also install [cuDNN](https://developer.nvidia.com/CUDNN) on your -cluster. cuDNN is used as a backend for Deep Learning frameworks, such as -TensorFlow. A reasonable default will be selected. To explicitly select a -version, include the metadata parameter `--metadata cudnn-version=x.x.x.x`. You -can find the list of archived versions -[here](https://developer.nvidia.com/rdp/cudnn-archive) which includes all -versions except the latest. To locate the version you need, click on Download -option for the correct cuDNN + CUDA version you desire, copy the link address -for the `cuDNN Runtime Library for Ubuntu18.04 x86_64 (Deb)` file of the -matching CUDA version and find the full version from the deb file. For instance, -for `libcudnn8_8.0.4.30-1+cuda11.0_amd64.deb`, the version is `8.0.4.30`. 
Below -is a table for mapping some recent major.minor cuDNN versions to full versions -and compatible CUDA versions: - -Major.Minor | Full Version | CUDA Versions | Release Date ------------ | ------------ | -------------------------- | ------------ -8.6 | 8.6.0.163 | 10.2, 11.8 | 2022-09-22 -8.5 | 8.5.0.96 | 10.2, 11.7 | 2022-08-04 -8.4 | 8.4.1.50 | 10.2, 11.6 | 2022-05-27 -8.3 | 8.3.3.40 | 10.2, 11.5 | 2022-03-18 -8.2 | 8.2.4.15 | 10.2, 11.4 | 2021-08-31 -8.1 | 8.1.1.33 | 10.2, 11.2 | 2021-02-25 -8.0 | 8.0.5.39 | 10.1, 10.2, 11.0, 11.1 | 2020-11-01 -7.6 | 7.6.5.32 | 9.0, 9.2, 10.0, 10.1, 10.2 | 2019-10-28 -7.5 | 7.5.1.10 | 9.0, 9.2, 10.0, 10.1 | 2019-04-17 - -To figure out which version you need, refer to the framework's documentation, -sometimes found in the "building from source" sections. -[Here](https://www.tensorflow.org/install/source#gpu) is TensorFlow's. - -#### Metadata parameters: - -- `install-gpu-agent: true|false` - this is an optional parameter with - case-sensitive value. Default is `false`. - - **Note:** This parameter will collect GPU utilization and send statistics to - Stackdriver. Make sure you add the correct scope to access Stackdriver. - -- `gpu-driver-url: ` - this is an optional parameter for customizing - NVIDIA-provided GPU driver on Debian. Default is - `https://download.nvidia.com/XFree86/Linux-x86_64/495.29.05/NVIDIA-Linux-x86_64-495.29.05.run` - -- `cuda-url: ` - this is an optional parameter for customizing - NVIDIA-provided CUDA on Debian. This is required if not using CUDA `10.1` or - `10.2` with a Debian image. Please find the appropriate linux-based - runtime-file URL [here](https://developer.nvidia.com/cuda-toolkit-archive). - Default is - `https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run` - -- `rapids-runtime: SPARK|DASK|` - this is an optional parameter for - customizing the rapids runtime. Default is `SPARK`. - -- `cuda-version: 10.1|10.2|` - this is an optional parameter for - customizing NVIDIA-provided CUDA version. Default is `11.5`. - -- `nccl-version: 2.8.3|2.11.4|` - this is an optional parameter for - customizing NVIDIA-provided NCCL version. Default is `2.11.4`. - -- `gpu-driver-version: 460.73.01|495.29.05|` - this is an optional - parameter for customizing NVIDIA-provided kernel driver version. Default is - `495.29.05`. - -- `cudnn-version: ` - this is an optional parameter for installing - [NVIDIA cuDNN](https://developer.nvidia.com/CUDNN) version `x.x.x.x`. - Default is `8.3.3.40`. - -- `private_secret_name: ` - -- `public_secret_name: ` - -- `secret_version: ` - -- `secret_project: ` - -- `cert_modulus_md5sum: ` -These arguments can be used to - specify the driver signing parameters. The certificate named by - `public_secret_name` must be included in the boot sector of the - disk from which the cluster is booted. The key named by - `private_secret_name` must correspond to the certificate named by - `public_secret_name`, and the `cert_modulus_md5sum` must match the - modulus md5sum of the files referenced by both the private and - public secret names. - -- `http-proxy: :` - Optional. The address of an HTTP - proxy to use for internet egress. The script will configure `apt`, - `curl`, `gsutil`, `pip`, `java`, and `gpg` to use this proxy. - -- `http-proxy-pem-uri: ` - Optional. A `gs://` path to the +### GPU Scheduling in YARN: + +This script configures YARN, Dataproc's default Resource Manager, for GPU +awareness. + + * It sets `yarn.io/gpu` as a resource type. 
+ * It configures the `LinuxContainerExecutor` and cgroups for GPU isolation. + * It installs a GPU discovery script (`getGpusResources.sh`) for Spark, which + caches results to minimize `nvidia-smi` calls. + * Spark default configurations in `/etc/spark/conf/spark-defaults.conf` + are updated with GPU-related properties (e.g., + `spark.executor.resource.gpu.amount`) and the RAPIDS Spark plugin + (`com.nvidia.spark.SQLPlugin`) is commonly configured. + +### cuDNN + +This script can install [NVIDIA cuDNN](https://developer.nvidia.com/CUDNN), +a GPU-accelerated library for deep neural networks. + + * If `include-pytorch=yes` is specified or `cudnn-version` is provided, + a compatible version of cuDNN will be selected and installed based on the + determined CUDA version. + * To install a specific version of cuDNN, use the `cudnn-version` metadata + parameter (e.g., `--metadata cudnn-version=8.9.7.29`). Please consult the + [cuDNN Archive](https://developer.nvidia.com/rdp/cudnn-archive) and your + deep learning framework's documentation for CUDA compatibility. The script + may use `libcudnn` packages or tarball installations. + +**Example cuDNN Version Mapping (Illustrative):** + +| cuDNN Major.Minor | Example Full Version | Compatible CUDA Versions (General) | +|-------------------|----------------------|------------------------------------| +| 8.6 | 8.6.0.163 | 10.2, 11.x | +| 8.9 | 8.9.7.29 | 11.x, 12.x | +| 9.x | e.g., 9.6.0.74 | 12.x | + +### Metadata Parameters: + +This script accepts the following metadata parameters: + + * `install-gpu-agent`: `true`|`false`. **Default: `true`**. + Installs GPU monitoring agent. Requires the + `https://www.googleapis.com/auth/monitoring.write` scope. + * `cuda-version`: (Optional) Specify desired CUDA version (e.g., `11.8`, + `12.4.1`). Overrides default CUDA selection. + * `cuda-url`: (Optional) HTTP/HTTPS URL to a specific CUDA toolkit `.run` file + (e.g., `https://developer.download.nvidia.com/.../cuda_12.4.1_..._linux.run`). + Fetched using `curl`. Overrides `cuda-version` and default selection. + * `gpu-driver-version`: (Optional) Specify NVIDIA driver version (e.g., + `550.90.07`). Overrides default compatible driver selection. + * `gpu-driver-url`: (Optional) HTTP/HTTPS URL to a specific NVIDIA driver + `.run` file (e.g., `https://us.download.nvidia.com/.../NVIDIA-Linux-x86_64-...run`). + Fetched using `curl`. Overrides `gpu-driver-version`. + * `gpu-driver-provider`: (Optional) `OS`|`NVIDIA`. Default: `NVIDIA`. + Determines preference for OS-provided vs. NVIDIA-direct drivers. + The script often prioritizes `.run` files or source builds for reliability. + * `cudnn-version`: (Optional) Specify cuDNN version (e.g., `8.9.7.29`). + * `nccl-version`: (Optional) Specify NCCL version. + * `include-pytorch`: (Optional) `yes`|`no`. Default: `no`. + If `yes`, installs PyTorch, TensorFlow, RAPIDS, and PySpark in a Conda + environment. + * `gpu-conda-env`: (Optional) Name for the PyTorch Conda environment. + Default: `dpgce`. + * `container-runtime`: (Optional) E.g., `docker`, `containerd`, `crio`. + For NVIDIA Container Toolkit configuration. Auto-detected if not specified. + * `http-proxy`: (Optional) URL of an HTTP proxy for downloads. + * `http-proxy-pem-uri`: (Optional) A `gs://` path to the PEM-encoded certificate file used by the proxy specified in `http-proxy`. This is needed if the proxy uses TLS and its certificate is not already trusted by the cluster's default trust store (e.g., if it's a self-signed certificate or signed by an internal CA). 
The script will install this certificate into the system and Java trust stores. + * `invocation-type`: (For Custom Images) Set to `custom-images` by image + building tools. Not typically set by end-users creating clusters. + * **Secure Boot Signing Parameters:** Used if Secure Boot is enabled and + you need to sign kernel modules built from source. + ```text + private_secret_name= + public_secret_name= + secret_project= + secret_version= + modulus_md5sum= + ``` -#### Loading built kernel module - -For platforms which do not have pre-built binary kernel drivers, the -script will execute the .run file, installing the kernel driver -support libraries. - -In addition to installing the support libraries, the open kernel -module is fetched from github and built locally. There are metadata -attributes which can be used to specify the MOK key used to sign -kernel modules for use with secure boot. - -- `private_secret_name: ` - -- `public_secret_name: ` - -- `secret_version: ` - -- `secret_project: ` - - -Please see custom-images/examples/secure-boot/create-key-pair.sh for -details on what these attributes are and how they are used. - -In order to load a kernel module built from source, either the -`--no-shielded-secure-boot` argument must be passed to `gcloud -dataproc clusters create`, or a trusted certificate must be included -in the cluster's base image using the custom-image script, and secret -names storing signing material must be supplied using metadata -arguments. Attempts to build from source with misconfigured or -missing certificates will result in an error similar to the following: - -``` -ERROR: The kernel module failed to load. Secure boot is enabled on this system, so this is likely because it was not signed by a key that is trusted by the kernel. Please try installing the driver again, and sign the kernel module when prompted to do so. -ERROR: Unable to load the kernel module 'nvidia.ko'. This happens most frequently when this kernel module was built against the wrong or improperly configured kernel sources, with a version of gcc that differs from the one used to build the target kernel, or if another driver, such as nouveau, is present and prevents the NVIDIA kernel module from obtaining ownership of the NVIDIA device(s), or no NVIDIA device installed in this system is supported by this NVIDIA Linux graphics driver release. -Please see the log entries 'Kernel module load error' and 'Kernel messages' at the end of the file '/var/log/nvidia-installer.log' for more information. -ERROR: Installation has failed. Please see the file '/var/log/nvidia-installer.log' for details. You may find suggestions on fixing installation problems in the README available on the Linux driver download page at www.nvidia.com. -``` - -The simple but unsecured resolution to this problem is to pass the -`--no-shielded-secure-boot` argument to `gcloud dataproc clusters -create` so that the unsigned kernel module built from source can be -loaded into the running kernel. - -The complex but secure resolution is to run the -custom-images/examples/secure-boot/create-key-pair.sh so that the tls/ -directory is populated with the certificates, and on first run, cloud -secrets are populated with the signing material. - -The `custom-images/examples/secure-boot/create-key-pair.sh` script -emits bash code which can be evaluated in order to populate -appropriate environment variables. 
You will need to run `gcloud -config set project ${PROJECT_ID}` before running `create-key-pair.sh` -to specify the project of the secret manager service. - -```bash -$ bash custom-images/examples/secure-boot/create-key-pair.sh -modulus_md5sum=ffffffffffffffffffffffffffffffff -private_secret_name=efi-db-priv-key-042 -public_secret_name=efi-db-pub-key-042 -secret_project=your-project-id -secret_version=1 -``` - - -#### Verification +### Loading Built Kernel Module & Secure Boot + +When the script needs to build NVIDIA kernel modules from source (e.g., using +NVIDIA's open-gpu-kernel-modules repository, or if pre-built OS packages are +not suitable), special considerations apply if Secure Boot is enabled. + + * **Secure Boot Active:** Locally compiled modules must be signed with a key + trusted by the system's UEFI firmware. + * **MOK Key Signing:** Provide the Secure Boot signing metadata parameters + (listed above) to use keys stored in GCP Secret Manager. The public MOK + certificate must be enrolled in your base image's UEFI keystore. See + `GoogleCloudDataproc/custom-images/examples/secure-boot/create-key-pair.sh` + for guidance on key creation and management. + * **Disabling Secure Boot (Unsecured Workaround):** You can pass the + `--no-shielded-secure-boot` flag to `gcloud dataproc clusters create`. + This allows unsigned modules but disables Secure Boot's protections. + * **Error Indication:** If a kernel module fails to load due to signature + issues while Secure Boot is active, check `/var/log/nvidia-installer.log` + or `dmesg` output for errors like "Operation not permitted" or messages + related to signature verification failure. + +### Verification 1. Once the cluster has been created, you can access the Dataproc cluster and verify NVIDIA drivers are installed successfully. @@ -306,40 +247,81 @@ secret_version=1 sudo nvidia-smi ``` -2. If you install the GPU collection service, verify installation by using the - following command: +2. If the CUDA toolkit was installed, verify the compiler: ```bash - sudo systemctl status gpu-utilization-agent.service + /usr/local/cuda/bin/nvcc --version ``` -For more information about GPU support, take a look at -[Dataproc documentation](https://cloud.google.com/dataproc/docs/concepts/compute/gpus) +3. If you install the GPU collection service (`install-gpu-agent=true`, default), + verify installation by using the following command: -### Report metrics + ```bash + sudo systemctl status gpu-utilization-agent.service + ``` -The initialization action installs a -[monitoring agent](https://github.com/GoogleCloudPlatform/ml-on-gcp/tree/master/dlvm/gcp-gpu-utilization-metrics) -that monitors the GPU usage on the instance. This will auto create and send the -GPU metrics to the Cloud Monitoring service. + (The service should be `active (running)`). -### Troubleshooting +For more information about GPU support, take a look at +[Dataproc documentation](https://cloud.google.com/dataproc/docs/concepts/compute/gpus). -Problem: Error when running `report_gpu_metrics` +### Report Metrics -``` -google.api_core.exceptions.InvalidArgument: 400 One or more TimeSeries could not be written: -One or more points were written more frequently than the maximum sampling period configured for the metric. -:timeSeries[0] -``` +The GPU monitoring agent (installed when `install-gpu-agent=true`) automatically +collects and sends GPU utilization and memory usage metrics to Cloud Monitoring. 
+The agent is based on code from the
+[ml-on-gcp/gcp-gpu-utilization-metrics](https://github.com/GoogleCloudPlatform/ml-on-gcp/tree/master/dlvm/gcp-gpu-utilization-metrics)
+repository. The `create_gpu_metrics.py` script mentioned in older
+documentation is no longer used by this initialization action, as the agent
+handles metric creation and reporting.
 
-Solution: Verify service is running in background
+### Troubleshooting
 
-```bash
-sudo systemctl status gpu-utilization-agent.service
-```
+ * **Installation Failures:** Examine the initialization action log on the
+   affected node, typically `/var/log/dataproc-initialization-script-0.log`
+   (or a similar name if multiple init actions are used).
+ * **GPU Agent Issues:** If the agent was installed (`install-gpu-agent=true`),
+   check its service logs using `sudo journalctl -u gpu-utilization-agent.service`.
+ * **Driver Load or Secure Boot Problems:** Review `dmesg` output and
+   `/var/log/nvidia-installer.log` for errors related to module loading or
+   signature verification.
+ * **"Points written too frequently" (GPU Agent):** This was a known issue with
+   older versions of the `report_gpu_metrics.py` service. The current script
+   and agent versions aim to mitigate this. If encountered, check agent logs.
 
 ## Important notes
 
-* This initialization script will install NVIDIA GPU drivers in all nodes in
-  which a GPU is detected.
+ * This initialization script will install NVIDIA GPU drivers in all nodes in
+   which a GPU is detected. If no GPUs are present on a node, most
+   GPU-specific installation steps are skipped.
+ * **Performance & Caching:**
+   * The script extensively caches downloaded artifacts (drivers, CUDA `.run`
+     files) and compiled components (kernel modules, NCCL, Conda environments)
+     to a GCS bucket. This bucket is typically specified by the
+     `dataproc-temp-bucket` cluster property or metadata.
+   * **First Run / Cache Warming:** Initial runs on new configurations (OS,
+     kernel, or driver version combinations) that require source compilation
+     (e.g., for NCCL or kernel modules when no pre-compiled version is
+     available or suitable) can be time-consuming.
+     * On small instances (e.g., 2-core nodes), this process can take
+       up to **150 minutes**.
+     * To optimize and avoid long startup times on production clusters,
+       it is highly recommended to "pre-warm" the GCS cache. This can be
+       done by running the script once on a temporary, larger instance
+       (e.g., a single-node, 32-core machine) with your target OS and
+       desired GPU configuration (see the example command after this list).
+       This will build and cache the necessary components. Subsequent cluster
+       creations using the same cache bucket will be significantly faster
+       (e.g., the init action might take 12-20 minutes on a large instance
+       for the initial build, and then much faster on subsequent nodes using
+       the cache).
+   * **Security Benefit of Caching:** When the script successfully finds and
+     uses cached, pre-built artifacts, it often bypasses the need to
+     install build tools (e.g., `gcc`, `kernel-devel`, `make`) on the
+     cluster nodes. This reduces the attack surface area of the
+     resulting cluster instances.
+ * SSHD configuration is hardened by default by the script.
+ * The script includes logic to manage APT sources and GPG keys for
+   Debian-based systems, including handling of archived backports repositories
+   to ensure dependencies can be met.
+ * Tested primarily with Dataproc 2.0+ images. Support for older Dataproc
+   1.5 images is limited.
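+
+As a rough, illustrative sketch of the pre-warming approach described in the
+notes above (the cluster name, region, and machine/accelerator choices here
+are placeholders, not values mandated by the script):
+
+```bash
+REGION=<region>
+# One-off, single-node cluster used only to build and cache GPU artifacts in
+# the cluster's dataproc-temp-bucket; delete it once the init action finishes.
+gcloud dataproc clusters create gpu-cache-warmer \
+  --region ${REGION} \
+  --single-node \
+  --master-machine-type n1-standard-32 \
+  --master-accelerator type=nvidia-tesla-t4,count=1 \
+  --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh
+```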
\ No newline at end of file diff --git a/gpu/gpu_test_case_base.py b/gpu/gpu_test_case_base.py new file mode 100644 index 000000000..b34381e65 --- /dev/null +++ b/gpu/gpu_test_case_base.py @@ -0,0 +1,136 @@ +import os +import time +import random +from packaging import version +from integration_tests.dataproc_test_case import DataprocTestCase + +DEFAULT_TIMEOUT = 45 # minutes + +class GpuTestCaseBase(DataprocTestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def run_dataproc_job(self, + cluster_name, + job_type, + job_params, + timeout_in_minutes=DEFAULT_TIMEOUT): + """Executes Dataproc job on a cluster and returns results. + + Args: + cluster_name: cluster name to submit job to + job_type: type of the job, e.g. spark, hadoop, pyspark + job_params: job parameters + timeout_in_minutes: timeout in minutes + + Returns: + ret_code: the return code of the job + stdout: standard output of the job + stderr: error output of the job + """ + + ret_code, stdout, stderr = DataprocTestCase.run_command( + 'gcloud dataproc jobs submit {} --cluster={} --region={} {}'. + format(job_type, cluster_name, self.REGION, + job_params), timeout_in_minutes) + return ret_code, stdout, stderr + + # Tests for PyTorch + TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py" + + # Tests for TensorFlow + TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py" + + def assert_instance_command(self, + instance, + cmd, + timeout_in_minutes=DEFAULT_TIMEOUT): + retry_count = 5 + ssh_cmd = 'gcloud compute ssh -q {} --zone={} --command="{}" -- -o ConnectTimeout=60 -o StrictHostKeyChecking=no'.format( + instance, self.cluster_zone, cmd.replace('"', '\"')) + + while retry_count > 0: + try: + # Use self.assert_command from DataprocTestCase + ret_code, stdout, stderr = self.assert_command(ssh_cmd, timeout_in_minutes) + return ret_code, stdout, stderr + except Exception as e: + print(f"An error occurred in assert_instance_command: {e}") + retry_count -= 1 + if retry_count > 0: + print(f"Retrying in 10 seconds...") + time.sleep(10) + continue + else: + print("Max retries reached.") + raise + + def verify_instance(self, name): + # Verify that nvidia-smi works + self.assert_instance_command(name, "nvidia-smi", 1) + print(f"OK: nvidia-smi on {name}") + + def verify_instance_gpu_agent(self, name): + print(f"--- Verifying GPU Agent on {name} ---") + self.assert_instance_command( + name, "systemctl is-active gpu-utilization-agent.service") + print(f"OK: GPU Agent on {name}") + + def get_dataproc_image_version(self, instance): + _, stdout, _ = self.assert_instance_command(instance, "grep DATAPROC_IMAGE_VERSION /etc/environment | cut -d= -f2") + return stdout.strip() + + def version_lt(self, v1, v2): + return version.parse(v1) < version.parse(v2) + + def verify_pytorch(self, name): + print(f"--- Verifying PyTorch on {name} ---") + test_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu", + self.TORCH_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) + + image_version = self.get_dataproc_image_version(name) + conda_root_path = "/opt/conda/miniconda3" + if not self.version_lt(image_version, "2.3"): + conda_root_path = "/opt/conda" + + conda_env = "dpgce" + env_path = f"{conda_root_path}/envs/{conda_env}" + python_bin = f"{env_path}/bin/python3" + + verify_cmd = ( + f"for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node; do " + f" if [[ -e \\\"$f\\\" ]]; then echo 0 > \\\"$f\\\"; fi; " + f"done; " + f"if /usr/share/google/get_metadata_value attributes/include-pytorch; 
then" + f" {python_bin} {self.TORCH_TEST_SCRIPT_FILE_NAME}; " + f"else echo 'PyTorch test skipped as include-pytorch is not set'; fi" + ) + _, stdout, _ = self.assert_instance_command(name, verify_cmd) + if "PyTorch test skipped" not in stdout: + self.assertTrue("True" in stdout, f"PyTorch CUDA not available or python not found in {env_path}") + print(f"OK: PyTorch on {name}") + self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) + + def verify_tensorflow(self, name): + print(f"--- Verifying TensorFlow on {name} ---") + test_filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "gpu", + self.TF_TEST_SCRIPT_FILE_NAME) + self.upload_test_file(test_filename, name) + + image_version = self.get_dataproc_image_version(name) + conda_root_path = "/opt/conda/miniconda3" + if not self.version_lt(image_version, "2.3"): + conda_root_path = "/opt/conda" + + conda_env="dpgce" + env_path = f"{conda_root_path}/envs/{conda_env}" + python_bin = f"{env_path}/bin/python3" + + verify_cmd = ( + f"for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${{f}} ; done ;" + f"{python_bin} {self.TF_TEST_SCRIPT_FILE_NAME}" + ) + self.assert_instance_command(name, verify_cmd) + print(f"OK: TensorFlow on {name}") + self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 673df668c..9a1ee94cd 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -114,7 +114,9 @@ function get_metadata_value() { function get_metadata_attribute() { local -r attribute_name="$1" local -r default_value="${2:-}" + set +e get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" + set -e } OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" @@ -459,11 +461,8 @@ function set_cuda_runfile_url() { elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then echo "CUDA 11.8.0 is the minimum version for Rocky 9. 
Requested version: ${CUDA_VERSION}" fi - } - - function set_cudnn_tarball_url() { CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" @@ -491,7 +490,12 @@ GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') readonly GPU_DRIVER_PROVIDER # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver -INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'true') +INSTALL_GPU_AGENT_METADATA=$(get_metadata_attribute 'install-gpu-agent' 'true') +ENABLE_GPU_MONITORING_METADATA=$(get_metadata_attribute 'enable-gpu-monitoring' 'true') +INSTALL_GPU_AGENT='true' +if [[ "${INSTALL_GPU_AGENT_METADATA}" == "false" ]] || [[ "${ENABLE_GPU_MONITORING_METADATA}" == "false" ]] ; then + INSTALL_GPU_AGENT='false' +fi readonly INSTALL_GPU_AGENT # Dataproc configurations @@ -503,6 +507,8 @@ NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 +IS_CUSTOM_IMAGE_BUILD="false" # Default + function execute_with_retries() ( local -r cmd="$*" @@ -816,7 +822,7 @@ function install_nvidia_cudnn() { sync else - echo "Unsupported cudnn version: [\"${CUDNN_VERSION}\"]" + echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi fi else @@ -835,11 +841,17 @@ function install_pytorch() { local env env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') - local mc3=/opt/conda/miniconda3 - [[ -d ${mc3} ]] || return - local envpath="${mc3}/envs/${env}" + + local conda_root_path + if version_lt "${DATAPROC_IMAGE_VERSION}" "2.3" ; then + conda_root_path="/opt/conda/miniconda3" + else + conda_root_path="/opt/conda" + fi + [[ -d ${conda_root_path} ]] || return + local envpath="${conda_root_path}/envs/${env}" if [[ "${env}" == "base" ]]; then - echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${mc3}" ; fi + echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${conda_root_path}" ; fi # Set numa node to 0 for all GPUs for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done @@ -883,7 +895,7 @@ function install_pytorch() { if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi # Install pytorch and company to this environment - "${mc3}/bin/mamba" "${verb}" -n "${env}" \ + "${conda_root_path}/bin/mamba" "${verb}" -n "${env}" \ -c conda-forge -c nvidia -c rapidsai \ numba pytorch tensorflow[and-cuda] rapids pyspark \ "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" @@ -996,7 +1008,6 @@ function add_contrib_component() { elif is_debian ; then sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list fi - return 0 } function add_nonfree_components() { @@ -1020,14 +1031,33 @@ function add_repo_nvidia_container_toolkit() { local signing_key_url="${nvctk_root}/gpgkey" local repo_data - if is_debuntu ; then repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" - else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" + # Since there are more than one keys to go into this keychain, we can't call os_add_repo, which only works with one + if is_debuntu ; then + # "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + local -r repo_name="nvidia-container-toolkit" + local -r kr_path="/usr/share/keyrings/${repo_name}.gpg" + GPG_PROXY_ARGS="" + if [[ -v HTTP_PROXY ]] ; then + GPG_PROXY="--keyserver-options http-proxy=${HTTP_PROXY}" + elif [[ -v http_proxy ]] ; then + GPG_PROXY="--keyserver-options http-proxy=${http_proxy}" + fi + execute_with_retries 
gpg --keyserver keyserver.ubuntu.com \ + ${GPG_PROXY_ARGS} \ + --no-default-keyring --keyring "${kr_path}" \ + --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" "0xc95b321b61e88c1809c4f759ddcae044f796ecb0" + local -r repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" + local -r repo_path="/etc/apt/sources.list.d/${repo_name}.list" + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + execute_with_retries apt-get update + else + repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" + os_add_repo nvidia-container-toolkit \ + "${signing_key_url}" \ + "${repo_data}" \ + "no" fi - - os_add_repo nvidia-container-toolkit \ - "${signing_key_url}" \ - "${repo_data}" \ - "no" } function add_repo_cuda() { @@ -1038,7 +1068,13 @@ function add_repo_cuda() { echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ | sudo tee "${sources_list_path}" - gpg --keyserver keyserver.ubuntu.com \ + GPG_PROXY_ARGS="" + if [[ -n "${HTTP_PROXY}" ]] ; then + GPG_PROXY="--keyserver-options http-proxy=${HTTP_PROXY}" + elif [[ -n "${http_proxy}" ]] ; then + GPG_PROXY="--keyserver-options http-proxy=${http_proxy}" + fi + execute_with_retries gpg --keyserver keyserver.ubuntu.com ${GPG_PROXY_ARGS} \ --no-default-keyring --keyring "${kr_path}" \ --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" else @@ -1059,9 +1095,9 @@ function build_driver_from_github() { pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { tarball_fn="${DRIVER_VERSION}.tar.gz" - curl ${curl_retry_args} \ + execute_with_retries curl ${curl_retry_args} \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ - | tar xz + \| tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } @@ -1089,7 +1125,7 @@ function build_driver_from_github() { local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + ${gsutil_cmd} rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" break fi sleep 5m @@ -1182,12 +1218,13 @@ function build_driver_from_packages() { function install_nvidia_userspace_runfile() { # Parameters for NVIDIA-provided Debian GPU driver - readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" + local -r USERSPACE_RUNFILE="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" - readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") + local -r DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/${USERSPACE_RUNFILE}" - USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" - readonly USERSPACE_FILENAME + local USERSPACE_URL + USERSPACE_URL="$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")" + readonly USERSPACE_URL # This .run file contains NV's OpenGL implementation as well as # nvidia optimized implementations of the gtk+ 2,3 stack(s) not @@ -1200,12 +1237,17 @@ function install_nvidia_userspace_runfile() { # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without 
installing it. is_complete userspace && return - local local_fn="${tmpdir}/userspace.run" + local local_fn="${tmpdir}/${USERSPACE_RUNFILE}" cache_fetched_package "${USERSPACE_URL}" \ - "${pkg_bucket}/nvidia/${USERSPACE_FILENAME}" \ + "${pkg_bucket}/nvidia/${USERSPACE_RUNFILE}" \ "${local_fn}" + local runfile_sha256sum + runfile_sha256sum="$(cd "${tmpdir}" && sha256sum "${USERSPACE_RUNFILE}")" + local runfile_hash + runfile_hash=$(echo "${runfile_sha256sum}" | awk '{print $1}') + local runfile_args runfile_args="" local cache_hit="0" @@ -1315,7 +1357,7 @@ function install_nvidia_userspace_runfile() { function install_cuda_runfile() { is_complete cuda && return - local local_fn="${tmpdir}/cuda.run" + local local_fn="${tmpdir}/${CUDA_RUNFILE}" cache_fetched_package "${NVIDIA_CUDA_URL}" \ "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \ @@ -1378,7 +1420,6 @@ function install_cuda(){ add_repo_cuda mark_complete cuda-repo - return 0 } function install_nvidia_container_toolkit() { @@ -1464,6 +1505,9 @@ function install_gpu_agent() { local venv="${install_dir}/venv" python_interpreter="/opt/conda/miniconda3/bin/python3" [[ -f "${python_interpreter}" ]] || python_interpreter="$(command -v python3)" + if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" && is_debuntu ; then + execute_with_retries "apt-get install -y -qq python3-venv" + fi "${python_interpreter}" -m venv "${venv}" ( source "${venv}/bin/activate" @@ -1531,7 +1575,6 @@ function configure_yarn_resources() { # This configuration should be applied only if GPU is attached to the node function configure_yarn_nodemanager() { - if [[ "${gpu_count}" == "0" ]] ; then return ; fi set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' set_hadoop_property 'yarn-site.xml' \ @@ -1562,7 +1605,6 @@ function configure_yarn_nodemanager() { } function configure_gpu_exclusive_mode() { - if [[ "${gpu_count}" == "0" ]] ; then return ; fi # only run this function when spark < 3.0 if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU @@ -1572,13 +1614,12 @@ function configure_gpu_exclusive_mode() { function fetch_mig_scripts() { mkdir -p /usr/local/yarn-mig-scripts sudo chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh + execute_with_retries wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi + execute_with_retries wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh sudo chmod 755 /usr/local/yarn-mig-scripts/* } function configure_gpu_script() { - if [[ "${gpu_count}" == "0" ]] ; then return ; fi # Download GPU discovery script local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' mkdir -p ${spark_gpu_script_dir} @@ -1644,10 +1685,8 @@ EOF # images, we must configure the Fair scheduler version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" || return - # TODO: when running this script to customize an image, this file - # needs to be written *after* bdutil completes - - cat >>"${spark_defaults_conf}" <>"${spark_defaults_conf}" < "${install_log}" 2>&1 + local 
retval="$?" + set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Status code: 404 for https' "${install_log}" ; then + local stg_url="https://download.rockylinux.org/stg/rocky/${os_ver}/devel/x86_64/os/Packages/k/" + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + "${stg_url}/kernel-${uname_r}.rpm" \ + "${stg_url}/kernel-core-${uname_r}.rpm" \ + "${stg_url}/kernel-modules-${uname_r}.rpm" \ + "${stg_url}/kernel-modules-core-${uname_r}.rpm" \ + "${stg_url}/kernel-devel-${uname_r}.rpm" + )" + fi + execute_with_retries "${dnf_cmd}" fi mark_complete build-dependencies @@ -1862,7 +1920,9 @@ function hold_nvidia_packages() { function check_secure_boot() { local SECURE_BOOT="disabled" - SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + if command -v mokutil ; then + SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}') + fi PSN="$(get_metadata_attribute private_secret_name)" readonly PSN @@ -1871,10 +1931,10 @@ function check_secure_boot() { echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster." exit 1 elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then - echo "Secure boot is enabled, but no signing material provided." + echo "Error: Secure boot is enabled, but no signing material provided." echo "Please either disable secure boot or provide signing material as per" echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" - exit 1 + return 1 fi CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" @@ -1884,38 +1944,296 @@ function check_secure_boot() { mok_der=/var/lib/shim-signed/mok/MOK.der else mok_key=/var/lib/dkms/mok.key mok_der=/var/lib/dkms/mok.pub ; fi + return 0 } -function main() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs +# Function to group Hadoop/Spark config steps (called in init-action mode or deferred) +function run_hadoop_spark_config() { + # Ensure necessary variables are available or re-evaluated + # prepare_gpu_env needs CUDA/Driver versions, call it first if needed + # Set GCS bucket for caching + if [[ ! -v pkg_bucket ]] ; then + temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" + readonly temp_bucket + readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" + fi + if [[ ! -v CUDA_VERSION || ! -v DRIVER_VERSION ]]; then prepare_gpu_env; fi + # Re-read ROLE + ROLE="$(get_metadata_attribute dataproc-role)"; + # Re-read SPARK_VERSION if not set or default + if [[ ! 
-v SPARK_VERSION || "${SPARK_VERSION}" == "0.0" ]]; then + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1 || echo "0.0")" + fi + # Re-check GPU count + set +e + gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" + set -e + # Re-check MIG status + IS_MIG_ENABLED=0 + NVIDIA_SMI_PATH='/usr/bin' # Reset default path + MIG_MAJOR_CAPS=0 + if [[ "${gpu_count}" -gt "0" ]] && nvsmi >/dev/null 2>&1; then # Check if nvsmi works before querying + migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader || echo '[N/A]')" + if [[ "${migquery_result}" != "[N/A]" && "${migquery_result}" != "" ]]; then + NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" + if [[ "${NUM_MIG_GPUS}" -eq "1" ]] && (echo "${migquery_result}" | grep -q Enabled); then + IS_MIG_ENABLED=1 + NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' # Set MIG path + MIG_MAJOR_CAPS=$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1 || echo 0) + if [[ ! -d "/usr/local/yarn-mig-scripts" ]]; then fetch_mig_scripts || echo "WARN: Failed to fetch MIG scripts." >&2; fi + fi + fi + fi + + # Ensure config directories exist + if [[ ! -d "${HADOOP_CONF_DIR}" || ! -d "${SPARK_CONF_DIR}" ]]; then + echo "ERROR: Config directories (${HADOOP_CONF_DIR}, ${SPARK_CONF_DIR}) not found. Cannot apply configuration." + return 1 # Use return instead of exit in a function + fi + + # Run config applicable to all nodes configure_yarn_resources - # Detect NVIDIA GPU - if (grep -h -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent); then - # if this is called without the MIG script then the drivers are not installed - migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader)" + # Run node-specific config + if [[ "${gpu_count}" -gt 0 ]]; then + configure_yarn_nodemanager + install_spark_rapids # Installs JARs + configure_gpu_script + configure_gpu_isolation + configure_gpu_exclusive_mode # Call this here, it checks Spark version internally + elif [[ "${ROLE}" == "Master" ]]; then + # Master node without GPU still needs some config + configure_yarn_nodemanager + install_spark_rapids # Still need JARs on Master + configure_gpu_script + else + # Worker node without GPU, skip node-specific YARN/Spark config. + : + fi + + return 0 # Explicitly return success +} + +# This function now ONLY generates the script and service file. +# It does NOT enable the service here. +function create_deferred_config_files() { + local -r service_name="dataproc-gpu-config" + local -r service_file="/etc/systemd/system/${service_name}.service" + # This is the script that will contain the config logic + local -r config_script_path="/usr/local/sbin/apply-dataproc-gpu-config.sh" + + # Use 'declare -f' to extract function definitions needed by the config logic + # and write them, along with the config logic itself, into the new script. + cat < "${config_script_path}" +#!/bin/bash +# Deferred configuration script generated by install_gpu_driver.sh +set -xeuo pipefail + +# --- Minimal necessary functions and variables --- +# Define constants +readonly HADOOP_CONF_DIR='/etc/hadoop/conf' +readonly SPARK_CONF_DIR='/etc/spark/conf' +readonly bdcfg="/usr/local/bin/bdconfig" +readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package + +# --- Define Necessary Global Arrays --- +# These need to be explicitly defined here as they are not functions. 
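+# Note: `declare -p` prints a reusable `declare` statement for each array,
+# so the generated script receives a point-in-time copy of these version
+# matrices rather than re-reading them at first boot.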
+$(declare -p DRIVER_FOR_CUDA) +$(declare -p DRIVER_SUBVER) +$(declare -p CUDNN_FOR_CUDA) +$(declare -p NCCL_FOR_CUDA) +$(declare -p CUDA_SUBVER) +# drv_for_cuda is defined within set_cuda_runfile_url, which is included below + +# Define minimal metadata functions +$(declare -f print_metadata_value) +$(declare -f print_metadata_value_if_exists) +$(declare -f get_metadata_value) +$(declare -f get_metadata_attribute) + +# Define nvsmi wrapper +$(declare -f nvsmi) +nvsmi_works="0" # Initialize variable used by nvsmi + +# Define version comparison +$(declare -f version_ge) +$(declare -f version_gt) +$(declare -f version_le) +$(declare -f version_lt) + +# Define OS check functions +$(declare -f os_id) +$(declare -f os_version) +$(declare -f os_codename) # Added os_codename as it's used by clean_up_sources_lists indirectly via os_add_repo +$(declare -f is_debian) +$(declare -f is_ubuntu) +$(declare -f is_rocky) +$(declare -f is_debuntu) +$(declare -f is_debian10) +$(declare -f is_debian11) +$(declare -f is_debian12) +$(declare -f is_rocky8) +$(declare -f is_rocky9) +$(declare -f is_ubuntu18) +$(declare -f is_ubuntu20) +$(declare -f is_ubuntu22) +$(declare -f ge_debian12) +$(declare -f le_debian10) +$(declare -f le_debian11) +$(declare -f ge_ubuntu20) +$(declare -f le_ubuntu18) +$(declare -f ge_rocky9) +$(declare -f os_vercat) # Added os_vercat as it's used by set_nv_urls/set_cuda_runfile_url +# Define _shortname (needed by install_spark_rapids -> cache_fetched_package and others) +readonly _shortname="\$(os_id)\$(os_version|perl -pe 's/(\\d+).*/\$1/')" +# Define shortname and nccl_shortname (needed by set_nv_urls) +if is_ubuntu22 ; then + nccl_shortname="ubuntu2004" + shortname="\$(os_id)\$(os_vercat)" +elif ge_rocky9 ; then + nccl_shortname="rhel8" + shortname="rhel9" +elif is_rocky ; then + shortname="\$(os_id | sed -e 's/rocky/rhel/')\$(os_vercat)" + nccl_shortname="\${shortname}" +else + shortname="\$(os_id)\$(os_vercat)" + nccl_shortname="\${shortname}" +fi +readonly shortname nccl_shortname + +# Define prepare_gpu_env and its dependencies +$(declare -f prepare_gpu_env) +$(declare -f set_cuda_version) +$(declare -f set_driver_version) +$(declare -f set_nv_urls) +$(declare -f set_cuda_runfile_url) +$(declare -f set_cudnn_version) +$(declare -f set_cudnn_tarball_url) +$(declare -f is_cuda11) +$(declare -f is_cuda12) +$(declare -f le_cuda11) +$(declare -f le_cuda12) +$(declare -f ge_cuda11) +$(declare -f ge_cuda12) +$(declare -f is_cudnn8) +$(declare -f is_cudnn9) + +# Define DATAPROC_IMAGE_VERSION (re-evaluate) +SPARK_VERSION="\$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1 || echo "0.0")" +if version_lt "\${SPARK_VERSION}" "2.5" ; then DATAPROC_IMAGE_VERSION="1.5" +elif version_lt "\${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" +elif version_lt "\${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" +elif version_lt "\${SPARK_VERSION}" "3.6" ; then + if [[ -f /etc/environment ]] ; then + eval "\$(grep '^DATAPROC_IMAGE_VERSION' /etc/environment)" || DATAPROC_IMAGE_VERSION="2.2" + else + DATAPROC_IMAGE_VERSION="2.2" + fi +else DATAPROC_IMAGE_VERSION="2.3" ; fi # Default to latest known version +readonly DATAPROC_IMAGE_VERSION + +# Define set_hadoop_property +$(declare -f set_hadoop_property) + +# --- Include definitions of functions called by the config logic --- +$(declare -f configure_yarn_resources) +$(declare -f configure_yarn_nodemanager) +$(declare -f install_spark_rapids) +$(declare -f configure_gpu_script) 
+$(declare -f configure_gpu_isolation)
+$(declare -f configure_gpu_exclusive_mode)
+$(declare -f fetch_mig_scripts)
+$(declare -f cache_fetched_package)
+$(declare -f execute_with_retries)
+
+# --- Define gsutil/gcloud commands and curl args ---
+gsutil_cmd="gcloud storage"
+gsutil_stat_cmd="gcloud storage objects describe"
+gcloud_sdk_version="\$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print \$2}' || echo '0.0.0')"
+if version_lt "\${gcloud_sdk_version}" "402.0.0" ; then
+  gsutil_cmd="gsutil -o GSUtil:check_hashes=never"
+  gsutil_stat_cmd="gsutil stat"
+fi
+curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"
+
+# --- Include the main config function ---
+$(declare -f run_hadoop_spark_config)
+
+# --- Execute the config logic ---
+if run_hadoop_spark_config; then
+  # Configuration successful, disable the service
+  systemctl disable ${service_name}.service
+  rm -f "${config_script_path}" "${service_file}"
+  systemctl daemon-reload
+else
+  echo "ERROR: Deferred configuration script (${config_script_path}) failed." >&2
+  # Keep the service enabled to allow for manual inspection/retry
+  exit 1
+fi
+
+# Restart services after applying config
+for svc in resourcemanager nodemanager; do
+  if (systemctl is-active --quiet hadoop-yarn-\${svc}.service); then
+    systemctl stop hadoop-yarn-\${svc}.service || echo "WARN: Failed to stop \${svc}"
+    systemctl start hadoop-yarn-\${svc}.service || echo "WARN: Failed to start \${svc}"
+  fi
+done
+
+exit 0
+EOF
+
+  chmod +x "${config_script_path}"
+
+  cat <<EOF >"${service_file}"
+[Unit]
+Description=Apply Dataproc GPU configuration on first boot
+# Ensure it runs after Dataproc agent and YARN services are likely up
+After=google-dataproc-agent.service network-online.target hadoop-yarn-resourcemanager.service hadoop-yarn-nodemanager.service
+Wants=network-online.target google-dataproc-agent.service
+
+[Service]
+Type=oneshot
+# Execute the generated config script; the unit is considered done after it exits
+ExecStart=${config_script_path}
+RemainAfterExit=no
+StandardOutput=journal+console
+StandardError=journal+console
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+  chmod 644 "${service_file}"
+  # Service is enabled later only if IS_CUSTOM_IMAGE_BUILD is true
+}
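The generation pattern in create_deferred_config_files bears a short illustration: function bodies are serialized into an unquoted heredoc with `declare -f`, so unescaped `$(...)`/`${...}` expansions are resolved while the installer runs, while `\$`-escaped forms survive into the generated script and are only resolved on first boot. A minimal stand-alone sketch of the pattern (hypothetical names, not part of the init action):

function say_hello() { echo "hello from $(hostname)"; }

cat <<EOF >/tmp/deferred-demo.sh
#!/bin/bash
# Pasted in verbatim when this generator runs:
$(declare -f say_hello)
say_hello
# Escaped, so evaluated only when the generated script itself runs:
echo "run at \$(date)"
EOF
chmod +x /tmp/deferred-demo.sh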
+
+
+function main() {
+  # Perform installations (these are generally safe during image build)
+  if (grep -qi PCI_ID=10DE /sys/bus/pci/devices/*/uevent); then
+
+    # Check MIG status early, primarily for driver installation logic
+    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader || echo '[N/A]')" # Use || for safety
     if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
     NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
-    if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
+    if [[ "${NUM_MIG_GPUS}" -gt 0 ]] ; then
       if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
         if (echo "${migquery_result}" | grep Enabled); then
           IS_MIG_ENABLED=1
-          NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
-          MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1`
-          fetch_mig_scripts
+          # Fetch MIG scripts early if needed by driver install/check
+          if [[ ! -d "/usr/local/yarn-mig-scripts" ]]; then fetch_mig_scripts || echo "WARN: Failed to fetch MIG scripts." >&2; fi
         fi
       fi
     fi
 
-    # if mig is enabled drivers would have already been installed
+    # Install core components if MIG is not already enabled (MIG setup implies drivers exist)
     if [[ $IS_MIG_ENABLED -eq 0 ]]; then
       install_nvidia_gpu_driver
       install_nvidia_container_toolkit
       install_cuda
-      load_kernel_module
+      load_kernel_module # Load modules after driver install
 
       if [[ -n ${CUDNN_VERSION} ]]; then
         install_nvidia_nccl
@@ -1953,6 +2271,7 @@ function main() {
           nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
         else
           nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
+        fi
       done
 
@@ -1969,16 +2288,33 @@ function main() {
     configure_gpu_script
     configure_gpu_isolation
   elif [[ "${ROLE}" == "Master" ]]; then
-    configure_yarn_nodemanager
-    configure_gpu_script
-  fi
-
-  # Restart YARN services if they are running already
-  for svc in resourcemanager nodemanager; do
-    if [[ $(systemctl show hadoop-yarn-${svc}.service -p SubState --value) == 'running' ]]; then
-      systemctl restart hadoop-yarn-${svc}.service
+    # Master node without GPU detected.
+    :
+  else
+    # Worker node without GPU detected.
+    :
+  fi # End GPU detection
+
+  # --- Generate Config Script and Service File ---
+  # This happens in both modes now
+  create_deferred_config_files
+
+  # --- Apply or Defer Configuration ---
+  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then
+    # Enable the systemd service for first boot
+    systemctl enable "dataproc-gpu-config.service"
+  else
+    # Running as a standard init action: execute the generated script immediately
+    local -r config_script_path="/usr/local/sbin/apply-dataproc-gpu-config.sh"
+    if [[ -x "${config_script_path}" ]]; then
+      bash -x "${config_script_path}"
+    else
+      echo "ERROR: Generated config script ${config_script_path} not found or not executable."
+      exit 1
     fi
-  done
+    # The config script handles its own cleanup and service disabling on success
+  fi
+  # --- End Apply or Defer ---
 }
 
 function cache_fetched_package() {
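On a node built from such a custom image, the oneshot unit should run once on first boot and then remove its own script and unit file. Ordinary systemd tooling is enough to confirm that behaviour; the unit and script names below are the ones generated above:

systemctl list-unit-files | grep dataproc-gpu-config || echo "unit already cleaned up"
journalctl -u dataproc-gpu-config.service --no-pager | tail -n 50
ls -l /usr/local/sbin/apply-dataproc-gpu-config.sh 2>/dev/null || echo "config script already removed"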
@@ -1995,6 +2331,7 @@ function cache_fetched_package() {
 }
 
 function clean_up_sources_lists() {
+  if ! is_debuntu; then return; fi
   #
   # bigtop (primary)
   #
@@ -2108,12 +2445,14 @@ function exit_handler() {
     if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi
   fi
-  set +e
+  set +e # Allow cleanup commands to fail without exiting script
   echo "Exit handler invoked"
 
   # Clear pip cache
+  # TODO: make this conditional on which OSs have pip without cache purge
   pip cache purge || echo "unable to purge pip cache"
 
+  # If system memory was sufficient to mount memory-backed filesystems
   if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
     # remove the tmpfs pip cache-dir
@@ -2148,7 +2487,7 @@ function exit_handler() {
       /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
       /usr/lib \
       /opt/nvidia/* \
-      /opt/conda/miniconda3 | sort -h
+      /opt/conda/miniconda3 2>/dev/null | sort -h
   elif is_debian ; then
     du -x -hs \
       /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
@@ -2159,13 +2498,13 @@ function exit_handler() {
       /usr \
       /var \
       / 2>/dev/null | sort -h
-  else
+  else # Rocky
     du -hs \
       /var/lib/docker \
       /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
       /usr/lib64/google-cloud-sdk \
       /opt/nvidia/* \
-      /opt/conda/miniconda3
+      /opt/conda/miniconda3 2>/dev/null | sort -h
   fi
 
   # Process disk usage logs from installation period
@@ -2182,7 +2521,7 @@ function exit_handler() {
 unshift(@samples,$first); $final=$samples[-1];
 ($starting)=(split(/\s+/,$first))[2] =~ /^(\d+)/;
 ($ending)=(split(/\s+/,$final))[2] =~ /^(\d+)/;
-@siz=( sort { $a => $b }
+@siz=( sort { $b <=> $a }
   map { (split)[2] =~ /^(\d+)/ } @samples );
 $max=$siz[0]; $min=$siz[-1]; $inc=$max-$starting;
 print( " samples-taken: ", scalar @siz, $/,
@@ -2194,12 +2533,12 @@ print( " samples-taken: ", scalar @siz, $/,
 
   echo "exit_handler has completed"
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
-    dd if=/dev/zero of=/zero
+  # zero free disk space (only if creating image)
+  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then
+    dd if=/dev/zero of=/zero status=progress || true
     sync
     sleep 3s
-    rm -f /zero
+    rm -f /zero || true
   fi
 
   return 0
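The Perl summary above relies on a descending numeric sort so that $siz[0] is the peak disk usage and $siz[-1] the minimum. The comparator can be sanity-checked in isolation with a throwaway one-liner, runnable anywhere perl is installed:

perl -e '@samples = (3, 17, 9, 42, 5);
         @siz = sort { $b <=> $a } @samples;   # numeric, descending
         print "max=$siz[0] min=$siz[-1]\n"'   # prints: max=42 min=3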
@@ -2373,7 +2712,6 @@ function mount_ramdisk(){
   if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi
 
   # Write to a ramdisk instead of churning the persistent disk
-  tmpdir="/mnt/shm"
   mkdir -p "${tmpdir}/pkgs_dirs"
   mount -t tmpfs tmpfs "${tmpdir}"
@@ -2429,6 +2767,17 @@ function prepare_to_install(){
   check_secure_boot
   set_proxy
 
+  # --- Detect Image Build Context ---
+  # Use 'initialization-actions' as the default name for clarity
+  INVOCATION_TYPE="$(get_metadata_attribute invocation-type "initialization-actions")"
+  if [[ "${INVOCATION_TYPE}" == "custom-images" ]]; then
+    IS_CUSTOM_IMAGE_BUILD="true"
+    # echo "Detected custom image build context (invocation-type=custom-images). Configuration will be deferred." # Keep silent
+  else
+    IS_CUSTOM_IMAGE_BUILD="false" # Ensure it's explicitly false otherwise
+    # echo "Running in initialization action mode (invocation-type=${INVOCATION_TYPE})." # Keep silent
+  fi
+
   # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
   # used as a more performant replacement for `gsutil`
   gsutil_cmd="gcloud storage"
@@ -2438,23 +2787,48 @@ function prepare_to_install(){
     gsutil_cmd="gsutil -o GSUtil:check_hashes=never"
     gsutil_stat_cmd="gsutil stat"
   fi
+
+  # if fetches of nvidia packages fail, apply -k argument to the following.
+  curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"
+
+  # After manually verifying the veracity of the asset, take note of sha256sum
+  # of the downloaded files in your gcs bucket and submit these data with an
+  # issue or pull request to the github repository
+  # GoogleCloudDataproc/initialization-actions and we will include those hashes
+  # with this script for manual validation at time of deployment.
+
+  # Please provide hash data in the following format:
+
+# ["cuda_11.5.2_495.29.05_linux.run"]="2c33591bb5b33a3d4bffafdc7da76fe4"
+# ["cuda_11.6.2_510.47.03_linux.run"]="2989d2d2a943fa5e2a1f29f660221788"
+# ["cuda_12.1.1_530.30.02_linux.run"]="2f0a4127bf797bf4eab0be2a547cb8d0"
+# ["cuda_12.4.1_550.54.15_linux.run"]="afc99bab1d8c6579395d851d948ca3c1"
+# ["cuda_12.6.3_560.35.05_linux.run"]="29d297908c72b810c9ceaa5177142abd"
+# ["NVIDIA-Linux-x86_64-495.46.run"]="db1d6b0f9e590249bbf940a99825f000"
+# ["NVIDIA-Linux-x86_64-510.108.03.run"]="a225bcb0373cbf6c552ed906bc5c614e"
+# ["NVIDIA-Linux-x86_64-530.30.02.run"]="655b1509b9a9ed0baa1ef6b2bcf80283"
+# ["NVIDIA-Linux-x86_64-550.135.run"]="a8c3ae0076f11e864745fac74bfdb01f"
+# ["NVIDIA-Linux-x86_64-550.142.run"]="e507e578ecf10b01a08e5424dddb25b8"
+
+  # Setup temporary directories (potentially on RAM disk)
+  tmpdir=/tmp/ # Default
+  mount_ramdisk # Updates tmpdir if successful
+  install_log="${tmpdir}/install.log" # Set install log path based on final tmpdir
+
   workdir=/opt/install-dpgce
-  tmpdir=/tmp/
+  # Set GCS bucket for caching
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
   readonly temp_bucket
   readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
   readonly bdcfg="/usr/local/bin/bdconfig"
   export DEBIAN_FRONTEND=noninteractive
 
+  # Prepare GPU environment variables (versions, URLs, counts)
   prepare_gpu_env
 
   mkdir -p "${workdir}/complete"
   trap exit_handler EXIT
-  mount_ramdisk
-
-  readonly install_log="${tmpdir}/install.log"
 
   is_complete prepare.common && return
 
@@ -2469,14 +2843,15 @@ function prepare_to_install(){
     if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi
     if is_ubuntu ; then
+      # Wait for gcloud to be available on Ubuntu
       while ! command -v gcloud ; do sleep 5s ; done
     fi
-  else
+  else # Rocky
     dnf clean all
   fi
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
+  # zero free disk space (only if creating image)
+  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then ( set +e
     time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero )
   fi
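prepare_to_install short-circuits with `is_complete prepare.common && return`, a marker-file checkpoint so that phases already performed during an image build are not repeated on first boot. The real helpers are defined elsewhere in install_gpu_driver.sh and are not shown in this hunk; a minimal sketch of the idea, with assumed implementations:

# Assumed shape of the checkpoint helpers; the real definitions may differ.
workdir=/opt/install-dpgce
function is_complete()   { test -f "${workdir}/complete/$1"; }
function mark_complete() { mkdir -p "${workdir}/complete"; touch "${workdir}/complete/$1"; }

if is_complete prepare.common; then
  echo "prepare.common already done on a previous run"
else
  echo "running prepare.common"
  mark_complete prepare.common
fi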
@@ -2507,23 +2882,27 @@ function check_os() {
   readonly SPARK_VERSION
   if version_lt "${SPARK_VERSION}" "2.4" || \
      version_ge "${SPARK_VERSION}" "4.0" ; then
-    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
+    echo "Error: Your Spark version (${SPARK_VERSION}) is not supported. Please use a supported version."
     exit 1
   fi
 
   # Detect dataproc image version
-  if (! test -v DATAPROC_IMAGE_VERSION) ; then
+  if (! test -v DATAPROC_IMAGE_VERSION || [[ -z "${DATAPROC_IMAGE_VERSION}" ]]) ; then
     if test -v DATAPROC_VERSION ; then
       DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
     else
       # When building custom-images, neither of the above variables
       # are defined and we need to make a reasonable guess
-
       if   version_lt "${SPARK_VERSION}" "2.5" ; then DATAPROC_IMAGE_VERSION="1.5"
       elif version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
       elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
-      elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
-      else echo "Unknown dataproc image version" ; exit 1 ; fi
+      elif version_lt "${SPARK_VERSION}" "3.6" ; then
+        if [[ -f /etc/environment ]] ; then
+          eval "$(grep '^DATAPROC_IMAGE_VERSION' /etc/environment)" || DATAPROC_IMAGE_VERSION="2.2"
+        else
+          DATAPROC_IMAGE_VERSION="2.2"
+        fi
+      else DATAPROC_IMAGE_VERSION="2.3" ; fi # Default to latest known version
     fi
   fi
 }
@@ -2613,23 +2992,25 @@ function install_spark_rapids() {
   local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'
 
   local jar_basename
+  local spark_jars_dir="/usr/lib/spark/jars"
+  mkdir -p "${spark_jars_dir}"
 
   jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
   cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
     "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-    "/usr/lib/spark/jars/${jar_basename}"
+    "${spark_jars_dir}/${jar_basename}"
 
   jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
   cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
     "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
-    "/usr/lib/spark/jars/${jar_basename}"
+    "${spark_jars_dir}/${jar_basename}"
 
   jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
   cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
     "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
-    "/usr/lib/spark/jars/${jar_basename}"
+    "${spark_jars_dir}/${jar_basename}"
 }
 
-prepare_to_install
-
-main
+# --- Script Entry Point ---
+prepare_to_install # Run preparation steps first
+main               # Call main logic
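With install_spark_rapids placing the RAPIDS and XGBoost jars under /usr/lib/spark/jars, a quick way to exercise the accelerated path after cluster creation is to submit a small job with the RAPIDS plugin enabled. This is only a smoke-test suggestion; the cluster name and region are placeholders, and the plugin class and properties are the standard spark-rapids ones rather than anything defined by this script:

gcloud dataproc jobs submit spark \
  --cluster=my-gpu-cluster --region=us-central1 \
  --class=org.apache.spark.examples.SparkPi \
  --jars=file:///usr/lib/spark/examples/jars/spark-examples.jar \
  --properties=spark.plugins=com.nvidia.spark.SQLPlugin,spark.rapids.sql.enabled=true \
  -- 100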
diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py
index 593523b7d..d6c86bd8c 100644
--- a/gpu/test_gpu.py
+++ b/gpu/test_gpu.py
@@ -180,8 +180,8 @@ def verify_driver_signature(self, name):
   def test_install_gpu_without_agent(self, configuration, machine_suffixes,
                                      master_accelerator, worker_accelerator,
                                      driver_provider):
-    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
+#    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
 
     metadata = "install-gpu-agent=false"
     if configuration == 'SINGLE' \
@@ -213,8 +213,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes,
   def test_install_gpu_with_agent(self, configuration, machine_suffixes,
                                   master_accelerator, worker_accelerator,
                                   driver_provider):
-    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
+#    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
     self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere")
 
@@ -250,8 +250,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes,
   def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
                                    master_accelerator, worker_accelerator,
                                    cuda_version):
-    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
+#    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
 
     if configuration == 'KERBEROS' \
         and self.getImageVersion() <= pkg_resources.parse_version("2.1"):
@@ -300,8 +300,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes,
   def test_install_gpu_with_mig(self, configuration, machine_suffixes,
                                 master_accelerator, worker_accelerator,
                                 driver_provider, cuda_version):
-    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
+#    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
 
     # Operation [projects/.../regions/.../operations/...] failed:
     # Invalid value for field 'resource.machineType': \
@@ -344,8 +344,8 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes,
   )
   def test_gpu_allocation(self, configuration, master_accelerator,
                           worker_accelerator, driver_provider):
-    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
+#    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
 
     if configuration == 'SINGLE' \
         and self.getImageOs() == 'rocky' \
@@ -379,8 +379,8 @@ def test_gpu_allocation(self, configuration, master_accelerator,
   def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes,
                                                   master_accelerator, worker_accelerator,
                                                   cuda_version):
-    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
-      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
+#    if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"):
+#      self.skipTest("disabling rocky9 builds due to out of date base dataproc image")
 
     if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \
         and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \
diff --git a/gpu/verify_cluster.py b/gpu/verify_cluster.py
new file mode 100644
index 000000000..5b07de62a
--- /dev/null
+++ b/gpu/verify_cluster.py
@@ -0,0 +1,133 @@
+import argparse
+import os
+import time
+from gpu.gpu_test_case_base import GpuTestCaseBase
+
+import json
+
+class VerifyCluster(GpuTestCaseBase):
+    def __init__(self, cluster_name, region, zone):
+        super().__init__()
+        self.cluster_name = cluster_name
+        self.name = cluster_name  # Set self.name for DataprocTestCase methods
+        self.REGION = region  # Set REGION for DataprocTestCase
+        self.cluster_region = region
+        self.cluster_zone = zone
+        # Mock other necessary DataprocTestCase attributes if needed
+        self.datetime = time.strftime("%Y%m%d-%H%M%S")
+        self.random = self.random_str(4)
+
+    def check_cluster_status(self):
+        print(f"--- Checking status of cluster {self.getClusterName()} ---")
+        cmd = "gcloud dataproc clusters describe {} --region={} --format=json".format(
+            self.getClusterName(), self.cluster_region)
+        ret_code, stdout, stderr = self.run_command(cmd)
+
+        if ret_code != 0:
+            print(f"ERROR: Failed to get cluster status for {self.getClusterName()}.")
+            print(f"STDERR: {stderr}")
+            exit(1)
+
+        try:
+            cluster_info = json.loads(stdout)
+            status = cluster_info.get('status', {}).get('state')
+            if status == 'RUNNING':
+                print(f"Cluster {self.getClusterName()} is RUNNING.")
+            else:
+                print(f"ERROR: Cluster {self.getClusterName()} is not in RUNNING state. Current state: {status}")
+                exit(1)
+        except json.JSONDecodeError:
+            print(f"ERROR: Failed to parse cluster describe output as JSON.")
+            print(f"STDOUT: {stdout}")
+            exit(1)
+
+    def verify_instance_spark(self):
+        print(f"--- Verifying Spark on {self.getClusterName()} ---")
+        ret_code, stdout, stderr = self.run_dataproc_job(
+            self.getClusterName(),
+            "spark",
+            f"--region={self.cluster_region} "
+            "--class=org.apache.spark.examples.SparkPi "
+            "--jars=file:///usr/lib/spark/examples/jars/spark-examples.jar "
+            "-- 10"  # Reduced iterations for faster test
+        )
+
+        if ret_code != 0:
+            if "is in state ERROR and cannot accept jobs" in stderr:
+                print(f"SPARK JOB FAILED: Cluster {self.getClusterName()} is in ERROR state. Agent on master may be down.")
+                # Optionally print a more detailed guide
+                print("  Check: systemctl status google-dataproc-agent on the master node.")
+                print("  Logs: journalctl -u google-dataproc-agent on the master node.")
+            else:
+                print(f"SPARK JOB FAILED: ret_code={ret_code}")
+                print(f"STDOUT:\n{stdout}")
+                print(f"STDERR:\n{stderr}")
+            raise AssertionError("SparkPi job failed on cluster")
+        else:
+            print(f"OK: SparkPi on {self.getClusterName()}")
+        # Add other spark jobs from test_gpu.py if needed
+
+    def get_instance_names(self):
+        ret_code, stdout, stderr = self.run_command(
+            "gcloud compute instances list --filter='name ~ {}-' --format='value(name)' --project={}".format(
+                self.getClusterName(), self.getProjectId()))
+        if ret_code != 0:
+            print(stderr)
+            raise Exception("Failed to list instances")
+        return stdout.strip().split('\n')
+
+    def getProjectId(self):
+        # Assuming gcloud is configured
+        ret_code, stdout, stderr = self.run_command("gcloud config get-value project")
+        if ret_code != 0:
+            print(stderr)
+            raise Exception("Failed to get project ID")
+        return stdout.strip()
+
+def main():
+    parser = argparse.ArgumentParser(description='Verify GPU setup on a running Dataproc cluster.')
+    parser.add_argument('--cluster', default=os.environ.get('CLUSTER_NAME'), help='The name of the Dataproc cluster.')
+    parser.add_argument('--region', default=os.environ.get('REGION'), help='The region of the cluster.')
+    parser.add_argument('--zone', default=os.environ.get('ZONE'), help='The zone of the cluster.')
+    parser.add_argument('--tests', nargs='+', default=['smi', 'agent', 'spark', 'torch'], help='Tests to run: smi, agent, spark, torch')
+    args = parser.parse_args()
+
+    if not args.cluster:
+        parser.error("The --cluster argument is required if CLUSTER_NAME environment variable is not set.")
+    if not args.region:
+        parser.error("The --region argument is required if REGION environment variable is not set.")
+    if not args.zone:
+        parser.error("The --zone argument is required if ZONE environment variable is not set.")
+
+    verifier = VerifyCluster(args.cluster, args.region, args.zone)
+    verifier.check_cluster_status()
+
+    instance_names = verifier.get_instance_names()
+    print(f"Found instances: {instance_names}")
+
+    if not instance_names or all(not s for s in instance_names):
+        print(f"ERROR: No instances found for cluster '{args.cluster}'.")
+        print("  Please check the following:")
+        print("  1. Is the CLUSTER_NAME environment variable or --cluster argument correct?")
+        print("  2. Is gcloud authenticated? Run 'gcloud auth list'.")
+        print("  3. Is the correct project selected? Run 'gcloud config list'.")
+        print("  4. Does the cluster actually exist and is running?")
+        exit(1)
+
+    for instance_name in instance_names:
+        if 'smi' in args.tests:
+            verifier.verify_instance(instance_name)
+        if 'agent' in args.tests:
+            verifier.verify_instance_gpu_agent(instance_name)
+        if 'torch' in args.tests:
+            verifier.verify_pytorch(instance_name)
+        # Add other verify functions here
+
+    if 'spark' in args.tests:
+        verifier.verify_instance_spark()
+
+    print("--- Verification Complete ---")
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..72d2b3391
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+bootstrapping==0.1.2
+click==8.1.7
+packaging
+setuptools<70.0.0
\ No newline at end of file
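The new verify_cluster.py entry point reads CLUSTER_NAME/REGION/ZONE from the environment or from flags, so it can be pointed at an existing cluster without going through the full test suite. An example invocation (placeholder cluster values; run from the repository root so the gpu package import resolves):

export CLUSTER_NAME=my-gpu-cluster REGION=us-central1 ZONE=us-central1-a
python3 -m gpu.verify_cluster --tests smi agent spark

# or with explicit flags, selecting a subset of checks:
python3 -m gpu.verify_cluster --cluster "${CLUSTER_NAME}" --region "${REGION}" --zone "${ZONE}" --tests smi spark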