diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md index c1efb44b..40735ca5 100644 --- a/notebooks/databricks/README.md +++ b/notebooks/databricks/README.md @@ -14,11 +14,10 @@ If you already have a Databricks account, you can run the example notebooks on a databricks workspace import --format AUTO --file init-pip-cuda-12.sh ${WS_SAVE_DIR}/init-pip-cuda-12.sh --profile ${PROFILE} ``` **Note**: the init script does the following on each Spark node: - - updates the CUDA runtime (required for Spark Rapids ML dependencies). - downloads and installs the [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) plugin for accelerating data loading and Spark SQL. - installs various `cuXX` dependencies via pip. - if the cluster environment variable `SPARK_RAPIDS_ML_NO_IMPORT_ENABLED=1` is define (see below), the init script also modifies a Databricks notebook kernel startup script to enable no-import change UX for the cluster. See [no-import-change](../README.md#no-import-change). -- Create a cluster using **Databricks 13.3 LTS ML GPU Runtime** using at least two single-gpu workers and add the following configurations to the **Advanced options**. +- Create a cluster using **Databricks 17.3 LTS ML GPU Runtime** using at least two single-gpu workers and add the following configurations to the **Advanced options**. - **Init Scripts** - add the workspace path to the uploaded init script `${WS_SAVE_DIR}/init-pip-cuda-12.sh` as set above (but substitute variables manually in the form). 
- **Spark** @@ -27,7 +26,7 @@ If you already have a Databricks account, you can run the example notebooks on a spark.task.resource.gpu.amount 0.125 spark.databricks.delta.preview.enabled true spark.python.worker.reuse true - spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-25.12.0.jar:/databricks/spark/python + spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.13-26.04.2.jar:/databricks/spark/python spark.sql.execution.arrow.maxRecordsPerBatch 100000 spark.plugins com.nvidia.spark.SQLPlugin spark.locality.wait 0s diff --git a/notebooks/databricks/init-pip-cuda-12.sh b/notebooks/databricks/init-pip-cuda-12.sh index a6fc0fd2..f53bb590 100644 --- a/notebooks/databricks/init-pip-cuda-12.sh +++ b/notebooks/databricks/init-pip-cuda-12.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,21 +15,17 @@ set -ex -# IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 -# also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) -# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) +# IMPORTANT: specify RAPIDS_VERSION fully 26.4.0 and not 26.4 +# also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (e.g. 26.4.0 and not 26.04.0) +# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 26.04.2 and not 26.4.2) +# +# Note that the SPARK_RAPIDS_VERSION will not necessarily match the RAPIDS_VERSION. Check https://nvidia.github.io/spark-rapids/docs/download.html for the latest +# spark-rapids version that is verified to be compatible with your Databricks Runtime (in this case, Databricks 17.3 LTS ML).
The available versions for RAPIDS_VERSION can be +# found by executing "pip index versions spark-rapids-ml". RAPIDS_VERSION=25.12.0 -SPARK_RAPIDS_VERSION=25.12.0 +SPARK_RAPIDS_VERSION=26.04.2 -curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda12.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar - -# install cudatoolkit 12.2 via runfile approach -wget https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run -sh cuda_12.2.2_535.104.05_linux.run --silent --toolkit - -# reset symlink and update library loading paths -rm /usr/local/cuda -ln -s /usr/local/cuda-12.2 /usr/local/cuda +curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.13-${SPARK_RAPIDS_VERSION}-cuda12.jar -o /databricks/jars/rapids-4-spark_2.13-${SPARK_RAPIDS_VERSION}.jar # upgrade pip /databricks/python/bin/pip install --upgrade pip