From 7e845dfefca7353dcd0aeaa14fcc06bdb65e2985 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Thu, 22 Jan 2026 13:50:03 -0800 Subject: [PATCH 01/13] preliminary rapids 26.02 updates to pass tests Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 4 ++-- python/README.md | 6 +++--- python/pyproject.toml | 2 +- python/src/spark_rapids_ml/regression.py | 2 -- python/src/spark_rapids_ml/umap.py | 1 + python/src/spark_rapids_ml/utils.py | 4 ++-- python/tests/test_linear_model.py | 6 +----- 7 files changed, 10 insertions(+), 15 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 13d119e9..cdec57d4 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -47,6 +47,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 && conda config --set solver libmamba # install cuML -ARG RAPIDS_VERSION=25.12 -RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ +ARG RAPIDS_VERSION=26.02 +RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/python/README.md b/python/README.md index 266e8fcd..d1ec68c7 100644 --- a/python/README.md +++ b/python/README.md @@ -20,9 +20,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). 
Example for CUDA Toolkit 12.2: ```bash -conda create -n rapids-25.12 \ +conda create -n rapids-26.02 \ -c rapidsai -c conda-forge -c nvidia \ - python=3.10 cuml=25.12 cuvs=25.12 pylibraft=25.12 raft-dask=25.12 cuda-version=12.2 numpy~=1.0 + python=3.10 cuml=26.02 cuvs=26.02 pylibraft=26.02 raft-dask=26.02 cuda-version=12.2 numpy~=1.0 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. @@ -31,7 +31,7 @@ conda create -n rapids-25.12 \ Once you have the conda environment, activate it and install the required packages. ```bash -conda activate rapids-25.12 +conda activate rapids-26.02 ## for development access to notebooks, tests, and benchmarks git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/python/pyproject.toml b/python/pyproject.toml index d201a20a..d75c51a3 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -14,7 +14,7 @@ [project] name = "spark-rapids-ml" -version = "25.12.0" +version = "26.2.0" authors = [ { name="Jinfeng Li", email="jinfeng@nvidia.com" }, { name="Bobby Wang", email="bobwang@nvidia.com" }, diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index b630bb36..04165722 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -191,7 +191,6 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: "maxIter": "max_iter", "regParam": "alpha", "solver": "solver", - "standardization": "normalize", # TODO: standardization is carried out in cupy not cuml so need a new type of param mapped value to indicate that. 
"tol": "tol", "weightCol": None, } @@ -219,7 +218,6 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "algorithm": "auto", "fit_intercept": True, "copy_X": True, - "normalize": False, "verbose": False, "alpha": 0.0001, "solver": "auto", # in cuml 25.04 default was changed to auto which is mapped to eig internally in cuml diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index d0e8cb15..016fba67 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -134,6 +134,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "verbose": False, "build_algo": "auto", "build_kwds": None, + "device_ids": None, } def _pyspark_class(self) -> Optional[ABCMeta]: diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index f8ac301a..e19070ed 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -222,8 +222,8 @@ def _configure_memory_resource( ) == type(rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)): _old_memory_resources.append(rmm.mr.get_current_device_resource()) _last_sam_headroom_size = sam_headroom - mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) - rmm.mr.set_current_device_resource(mr) + _mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) + rmm.mr.set_current_device_resource(_mr) if uvm_enabled: if not type(rmm.mr.get_current_device_resource()) == type( diff --git a/python/tests/test_linear_model.py b/python/tests/test_linear_model.py index eddfbfd6..c06832fd 100644 --- a/python/tests/test_linear_model.py +++ b/python/tests/test_linear_model.py @@ -115,7 +115,7 @@ def test_params(default_params: bool) -> None: cuml_params = get_default_cuml_parameters( cuml_classes=[CumlLinearRegression, Ridge, CD], - excludes=["handle", "output_type"], + excludes=["handle", "output_type", "normalize"], ) # Ensure internal cuml defaults match actual cuml defaults @@ -126,7 +126,6 @@ def 
test_params(default_params: bool) -> None: "alpha": spark_params["regParam"], "l1_ratio": spark_params["elasticNetParam"], "max_iter": spark_params["maxIter"], - "normalize": spark_params["standardization"], "tol": spark_params["tol"], } @@ -175,7 +174,6 @@ def test_linear_regression_params( "fit_intercept": True, "l1_ratio": 0.0, "max_iter": 100, - "normalize": True, "solver": "auto", } default_lr = LinearRegression() @@ -196,7 +194,6 @@ def test_linear_regression_params( { "alpha": reg, "fit_intercept": False, - "normalize": False, "solver": "eig", } ) @@ -234,7 +231,6 @@ def test_linear_regression_copy() -> None: ({"regParam": 0.12}, {"alpha": 0.12}), ({"elasticNetParam": 0.23}, {"l1_ratio": 0.23}), ({"fitIntercept": False}, {"fit_intercept": False}), - ({"standardization": False}, {"normalize": False}), ({"tol": 0.0132}, {"tol": 0.0132}), ({"verbose": True}, {"verbose": True}), ] From 96077ce3620620a6847dd42a58e016484e616811 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Mon, 30 Mar 2026 09:35:31 -0700 Subject: [PATCH 02/13] updates for 26.04 + claude skill for this update Signed-off-by: Erik Ordentlich --- .claude/skills/update-rapids-version/SKILL.md | 25 +++++++++++++++++++ python/benchmark/test_gen_data.py | 2 +- python/requirements.txt | 2 +- python/src/spark_rapids_ml/classification.py | 3 ++- python/src/spark_rapids_ml/feature.py | 3 ++- python/src/spark_rapids_ml/knn.py | 8 +++++- python/src/spark_rapids_ml/regression.py | 5 +++- python/src/spark_rapids_ml/tree.py | 2 ++ python/src/spark_rapids_ml/umap.py | 1 + python/tests/test_metrics.py | 2 +- python/tests/test_pipeline.py | 2 +- 11 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 .claude/skills/update-rapids-version/SKILL.md diff --git a/.claude/skills/update-rapids-version/SKILL.md b/.claude/skills/update-rapids-version/SKILL.md new file mode 100644 index 00000000..580e0be5 --- /dev/null +++ b/.claude/skills/update-rapids-version/SKILL.md @@ -0,0 +1,25 @@ +--- +name: 
update-rapids-version +description: Updates python code (e.g. internal api calls) so that tests pass after running in conda environment with updated rapids version. +--- + +You will be running in an already activated conda environment with the updated rapids dependencies. + +Make necessary code changes in the `python` directory tree to get the following test script to complete without error: + +```bash +cd python && CUDA_VISIBLE_DEVICES=0 bash run_test.sh +``` + +1. Fix any formatting errors reported by the script. +2. Fix any type-checking errors reported. +3. Fix all other pytest errors reported. + - Note that the pytest phase runs through all tests before reporting any errors. This can take a while. + - Most failures will be due to changes to internal apis in cuML that we rely on. + + +Iterate on 1., 2., and 3. until the script succeeds. The script can take a while to complete. + +For 3., when working on individual tests, especially if only a few are failing, it is faster to run only these tests via pytest directly, followed by a final full run. + +You may search the source code in the directory `../cuml` for relevant internal api changes. The branch for the desired version is checked out. 
diff --git a/python/benchmark/test_gen_data.py b/python/benchmark/test_gen_data.py index 9d29be69..8b3a8124 100644 --- a/python/benchmark/test_gen_data.py +++ b/python/benchmark/test_gen_data.py @@ -396,7 +396,7 @@ def _func_test_make_sparse_regression( for i in range(len(chunk_boundary)): start = 0 if i == 0 else chunk_boundary[i - 1] - dense_count = np.count_nonzero(X_np[:, start : chunk_boundary[i]]) + dense_count = int(np.count_nonzero(X_np[:, start : chunk_boundary[i]])) col_density = density_values[i] chunk_size = col_per_chunk[i] diff --git a/python/requirements.txt b/python/requirements.txt index ba2a9b48..9aa2c1a6 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -14,6 +14,6 @@ numpy_allocator psutil -pyspark>=3.2.1,<3.5 +pyspark>=3.2.1,<4.0 scikit-learn>=1.2.1 cryptography==46.0.6 diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index 8e6ebcd6..b2a26bf2 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -1481,6 +1481,7 @@ def _construct_lr() -> CumlT: import cupy as cp import numpy as np from cuml.linear_model.logistic_regression_mg import LogisticRegressionMG + from pylibraft.common import Handle from .utils import cudf_to_cuml_array @@ -1490,7 +1491,7 @@ def _construct_lr() -> CumlT: lrs = [] for i in range(num_models): - lr = LogisticRegressionMG(output_type="cupy") + lr = LogisticRegressionMG(handle=Handle(), output_type="cupy") lr.n_features_in_ = n_cols lr.n_cols = n_cols diff --git a/python/src/spark_rapids_ml/feature.py b/python/src/spark_rapids_ml/feature.py index 595a9e85..efb620ce 100644 --- a/python/src/spark_rapids_ml/feature.py +++ b/python/src/spark_rapids_ml/feature.py @@ -411,8 +411,9 @@ def _construct_pca() -> CumlT: """ from cuml.decomposition.pca_mg import PCAMG as CumlPCAMG + from pylibraft.common import Handle - pca = CumlPCAMG(output_type="numpy", **cuml_alg_params) + pca = 
CumlPCAMG(handle=Handle(), output_type="numpy", **cuml_alg_params) pca.n_features_in_ = n_cols diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index 347de9c1..041bfd87 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -79,7 +79,12 @@ def _param_mapping(cls) -> Dict[str, Optional[str]]: return {"k": "n_neighbors"} def _get_cuml_params_default(self) -> Dict[str, Any]: - return {"n_neighbors": 5, "verbose": False, "batch_size": 2000000} + return { + "n_neighbors": 5, + "verbose": False, + "batch_size": 2000000, + "radius": 1.0, + } def _pyspark_class(self) -> Optional[ABCMeta]: return None @@ -853,6 +858,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "algorithm": "ivfflat", "metric": "euclidean", "algo_params": None, + "radius": 1.0, } def _pyspark_class(self) -> Optional[ABCMeta]: diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index 04165722..5dc5e130 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -787,6 +787,7 @@ def _get_cuml_transform_func( def _construct_lr() -> CumlT: from cuml.linear_model.linear_regression_mg import LinearRegressionMG + from pylibraft.common import Handle from .utils import cudf_to_cuml_array @@ -796,7 +797,9 @@ def _construct_lr() -> CumlT: intercepts = intercept_ if isinstance(intercept_, list) else [intercept_] for i in range(len(coefs)): - lr = LinearRegressionMG(output_type="numpy", copy_X=False) + lr = LinearRegressionMG( + handle=Handle(), output_type="numpy", copy_X=False + ) # need this to revert a change in cuML targeting sklearn compat. 
lr.n_features_in_ = n_cols lr.n_cols = n_cols diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index bb418ce7..529d8177 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -678,6 +678,7 @@ def _get_cuml_transform_func( is_classification = self._is_classification() dtype = self.dtype num_classes = self._num_classes + n_cols = self.n_cols def _construct_rf() -> CumlT: if is_classification: @@ -698,6 +699,7 @@ def _construct_rf() -> CumlT: rf = cuRf() rf.n_classes_ = num_classes rf.classes_ = np.arange(num_classes, dtype=np.int32) + rf.n_features_in_ = n_cols rf._treelite_model_bytes = treelite.Model.deserialize_bytes(model) rfs.append(rf) diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index 016fba67..a7482a06 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -135,6 +135,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "build_algo": "auto", "build_kwds": None, "device_ids": None, + "force_serial_epochs": False, } def _pyspark_class(self) -> Optional[ABCMeta]: diff --git a/python/tests/test_metrics.py b/python/tests/test_metrics.py index ecaadae0..6c36359d 100644 --- a/python/tests/test_metrics.py +++ b/python/tests/test_metrics.py @@ -96,7 +96,7 @@ def test_multi_class_metrics( ).astype(np.float64) probabilities = np.random.rand(1000, num_classes) - probabilities[range(1000), list(pdf["label"].astype(np.integer))] = 2.0 + probabilities[range(1000), list(pdf["label"].astype(int))] = 2.0 probabilities = probabilities / np.sum(probabilities, axis=1).reshape(-1, 1) pdf["probabilities"] = list(probabilities) diff --git a/python/tests/test_pipeline.py b/python/tests/test_pipeline.py index 15fe446c..1caeb94b 100644 --- a/python/tests/test_pipeline.py +++ b/python/tests/test_pipeline.py @@ -420,7 +420,7 @@ def test_compat_random_forest( Pipeline, UMAP, UMAPModel, - {"n_components": 1}, + {"n_components": 2}, ), ( 
Pipeline, From 6d573b99ff6e7f386f63f8e43b2745c7e03fa3f2 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Mon, 30 Mar 2026 09:42:53 -0700 Subject: [PATCH 03/13] more 26.04 updates Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- python/README.md | 4 ++-- python/benchmark/databricks/run_benchmark.sh | 8 ++++---- python/pyproject.toml | 10 ++++------ 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index cdec57d4..2b2594b8 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -47,6 +47,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 && conda config --set solver libmamba # install cuML -ARG RAPIDS_VERSION=26.02 +ARG RAPIDS_VERSION=26.04 RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/python/README.md b/python/README.md index d1ec68c7..9e967fcb 100644 --- a/python/README.md +++ b/python/README.md @@ -20,9 +20,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 12.2: ```bash -conda create -n rapids-26.02 \ +conda create -n rapids-26.04 \ -c rapidsai -c conda-forge -c nvidia \ - python=3.10 cuml=26.02 cuvs=26.02 pylibraft=26.02 raft-dask=26.02 cuda-version=12.2 numpy~=1.0 + python=3.11 cuml=26.04 cuvs=26.04 pylibraft=26.04 raft-dask=26.04 cuda-version=12.2 numpy~=1.0 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. 
diff --git a/python/benchmark/databricks/run_benchmark.sh b/python/benchmark/databricks/run_benchmark.sh index 7920d511..c7eee3f9 100755 --- a/python/benchmark/databricks/run_benchmark.sh +++ b/python/benchmark/databricks/run_benchmark.sh @@ -15,7 +15,7 @@ cluster_type=${1:-gpu_etl} -db_version=${2:-13.3} +db_version=${2:-15.4} if [[ $cluster_type == "gpu" || $cluster_type == "gpu_etl" ]]; then num_cpus=0 @@ -25,13 +25,13 @@ elif [[ $cluster_type == "cpu" ]]; then num_gpus=0 else echo "unknown cluster type $cluster_type" - echo "usage: $0 cpu|gpu|gpu_etl [12.2|13.3|14.3|15.4]" + echo "usage: $0 cpu|gpu|gpu_etl [15.4]" exit 1 fi -if [[ $db_version > 13.3 && $cluster_type == "gpu_etl" ]]; then +if [[ $db_version > 16.4 && $cluster_type == "gpu_etl" ]]; then echo "spark rapids etl plugin is not supported on databricks ${db_version}" - echo "please specify db_version 12.2 or 13.3 for cluster type gpu_etl" + echo "please specify db_version 15.4 or 16.4 for cluster type gpu_etl" exit 1 fi diff --git a/python/pyproject.toml b/python/pyproject.toml index d75c51a3..1fdcd8e4 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,7 +14,7 @@ [project] name = "spark-rapids-ml" -version = "26.2.0" +version = "26.4.0" authors = [ { name="Jinfeng Li", email="jinfeng@nvidia.com" }, { name="Bobby Wang", email="bobwang@nvidia.com" }, @@ -23,16 +23,14 @@ authors = [ ] description = "Apache Spark integration with RAPIDS and cuML" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Environment :: GPU :: NVIDIA CUDA :: 12", - "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.0", - "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1", "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.2", "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.3", "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4", From dc518ac58eb562a97d37575af96996819c778b2e Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Thu, 28 May 2026 15:24:37 -0700 Subject: [PATCH 04/13] updates for rapids 26.06 Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- docker/Dockerfile.pip | 2 +- docker/Dockerfile.python | 2 +- python/README.md | 6 +++--- python/pyproject.toml | 2 +- python/src/spark_rapids_ml/__init__.py | 2 +- python/src/spark_rapids_ml/tree.py | 5 ++--- python/src/spark_rapids_ml/umap.py | 7 ++++--- python/src/spark_rapids_ml/utils.py | 2 +- python/tests/test_random_forest.py | 6 +++++- python/tests/test_umap.py | 9 +++++++-- 11 files changed, 27 insertions(+), 18 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 2b2594b8..306fca74 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -47,6 +47,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 && conda config --set solver libmamba # install cuML -ARG RAPIDS_VERSION=26.04 +ARG RAPIDS_VERSION=26.06 RUN conda install -y -c rapidsai-nightly -c 
conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index ac9cd292..58c6e5de 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -18,7 +18,7 @@ ARG CUDA_VERSION=12.2.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 ARG PYSPARK_VERSION=3.3.1 -ARG RAPIDS_VERSION=25.12.0 +ARG RAPIDS_VERSION=26.06.0 ARG ARCH=amd64 #ARG ARCH=arm64 diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index 0306459a..44bf5d40 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -17,7 +17,7 @@ ARG CUDA_VERSION=12.2.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 -ARG RAPIDS_VERSION=25.12 +ARG RAPIDS_VERSION=26.06 # ubuntu22 RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ diff --git a/python/README.md b/python/README.md index 9e967fcb..b2fe8e34 100644 --- a/python/README.md +++ b/python/README.md @@ -20,9 +20,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 12.2: ```bash -conda create -n rapids-26.04 \ +conda create -n rapids-26.06 \ -c rapidsai -c conda-forge -c nvidia \ - python=3.11 cuml=26.04 cuvs=26.04 pylibraft=26.04 raft-dask=26.04 cuda-version=12.2 numpy~=1.0 + python=3.11 cuml=26.06 cuvs=26.06 pylibraft=26.06 raft-dask=26.06 cuda-version=12.2 numpy~=1.0 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. @@ -31,7 +31,7 @@ conda create -n rapids-26.04 \ Once you have the conda environment, activate it and install the required packages. 
```bash -conda activate rapids-26.02 +conda activate rapids-26.06 ## for development access to notebooks, tests, and benchmarks git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/python/pyproject.toml b/python/pyproject.toml index 1fdcd8e4..3c418d85 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -14,7 +14,7 @@ [project] name = "spark-rapids-ml" -version = "26.4.0" +version = "26.6.0" authors = [ { name="Jinfeng Li", email="jinfeng@nvidia.com" }, { name="Bobby Wang", email="bobwang@nvidia.com" }, diff --git a/python/src/spark_rapids_ml/__init__.py b/python/src/spark_rapids_ml/__init__.py index ea4c268b..e9b6bf10 100644 --- a/python/src/spark_rapids_ml/__init__.py +++ b/python/src/spark_rapids_ml/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "25.12.0" +__version__ = "26.6.0" import pandas as pd import pyspark diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index 529d8177..d4693918 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -138,7 +138,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: return { "n_streams": 4, "n_estimators": 100, - "max_depth": 16, + "max_depth": "deprecated", "max_features": "sqrt", # for classification, should be 1.0 for regressor, cuml is a little broken here "n_bins": 128, "bootstrap": True, @@ -688,7 +688,6 @@ def _construct_rf() -> CumlT: import cupy as cp import numpy as np - import treelite rfs = [] treelite_models = ( @@ -700,7 +699,7 @@ def _construct_rf() -> CumlT: rf.n_classes_ = num_classes rf.classes_ = np.arange(num_classes, dtype=np.int32) rf.n_features_in_ = n_cols - rf._treelite_model_bytes = treelite.Model.deserialize_bytes(model) + rf._treelite_model_bytes = model rfs.append(rf) diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index a7482a06..d20b550e 
100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -29,6 +29,7 @@ Tuple, Type, Union, + cast, ) import numpy as np @@ -135,7 +136,7 @@ def _get_cuml_params_default(self) -> Dict[str, Any]: "build_algo": "auto", "build_kwds": None, "device_ids": None, - "force_serial_epochs": False, + "force_serial_epochs": None, } def _pyspark_class(self) -> Optional[ABCMeta]: @@ -1702,9 +1703,9 @@ def read_sparse_array( def read_dense_array(df_path: str) -> np.ndarray: data_df = spark.read.parquet(df_path).orderBy("row_id") - pdf = data_df.toPandas() + pdf = cast(PandasDataFrame, data_df.toPandas()) assert type(pdf) == pd.DataFrame - return np.array(list(pdf.data), dtype=np.float32) + return np.array(list(pdf["data"]), dtype=np.float32) metadata = DefaultParamsReader.loadMetadata(path, self.sc) data_path = os.path.join(path, "data") diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index e19070ed..945b3d2b 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -175,7 +175,7 @@ def _get_gpu_id(task_context: TaskContext) -> int: # invoke the corresponding deallocate methods. They will get cleaned up only when # the process exits. This avoids a segfault in the case of creating a new # SAM resource with a smaller headroom. -_old_memory_resources = [] +_old_memory_resources: List[Any] = [] # keep track of last headroom to check if new sam mr is needed. 
_last_sam_headroom_size = None diff --git a/python/tests/test_random_forest.py b/python/tests/test_random_forest.py index ff4c877e..5ab2f45d 100644 --- a/python/tests/test_random_forest.py +++ b/python/tests/test_random_forest.py @@ -331,7 +331,10 @@ def test_random_forest_basic( est.setLabelCol(label_col) assert est.getLabelCol() == label_col - def assert_model(lhs: RandomForestModel, rhs: RandomForestModel) -> None: + def assert_model( + lhs: Union[RandomForestClassificationModel, RandomForestRegressionModel], + rhs: Union[RandomForestClassificationModel, RandomForestRegressionModel], + ) -> None: assert lhs.cuml_params == rhs.cuml_params # Vector and array(double) type will be cast to array(float) by default @@ -342,6 +345,7 @@ def assert_model(lhs: RandomForestModel, rhs: RandomForestModel) -> None: assert lhs.n_cols == data_shape[1] if isinstance(lhs, RandomForestClassificationModel): + assert isinstance(rhs, RandomForestClassificationModel) assert lhs.numClasses == rhs.numClasses assert lhs.numClasses == n_classes diff --git a/python/tests/test_umap.py b/python/tests/test_umap.py index cc447d99..fdf0c84c 100644 --- a/python/tests/test_umap.py +++ b/python/tests/test_umap.py @@ -67,10 +67,15 @@ def _load_sparse_data( if normalize: row_sums = np.array(csr_mat.sum(axis=1)).flatten() - row_sums[row_sums == 0] = 1.0 + zero_rows = np.flatnonzero(row_sums == 0) + if len(zero_rows) > 0: + csr_mat = csr_mat.tolil() + csr_mat[zero_rows, 0] = 1.0 + csr_mat = csr_mat.tocsr() + row_sums = np.array(csr_mat.sum(axis=1)).flatten() row_sum_diag = scipy.sparse.diags(1.0 / row_sums) csr_mat = row_sum_diag @ csr_mat - assert np.allclose(np.array(csr_mat.sum(axis=1)).flatten(), 1.0) + assert np.allclose(np.array(csr_mat.sum(axis=1)).flatten(), 1.0, atol=1e-6) # Convert CSR matrix to SparseVectors data = [] From ba7f67eefb577e1b6c61bb17fed0b8baea7f97ee Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Thu, 28 May 2026 15:34:51 -0700 Subject: [PATCH 05/13] drop spark 3.3 test, 
as rapids minimum python is 3.11 which is not compatible with pyspark 3.3 Signed-off-by: Erik Ordentlich --- ci/test.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/ci/test.sh b/ci/test.sh index 05c80a88..94d46f1b 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -47,15 +47,3 @@ pip install -r requirements_dev.txt && pip install -e . # plugin tests ./run_plugin_test.sh -# check compatibility with Spark 3.3 in nightly run -# also push draft release docs to gh-pages -if [[ $type == "nightly" ]]; then - pip uninstall pyspark -y - pip install pyspark~=3.3.0 - ./run_test.sh - ./run_benchmark.sh $bench_args - # if everything passed till now update draft release docs in gh-pages - # need to invoke docs.sh from top level of repo - cd .. # top level of repo - ci/docs.sh nightly -fi From 3bed382d327042081ee6c633206442ab8d322242 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Sat, 30 May 2026 10:12:52 -0700 Subject: [PATCH 06/13] update databricks benchmark scripts Signed-off-by: Erik Ordentlich --- .../databricks/gpu_etl_cluster_spec.sh | 2 +- .../benchmark/databricks/init-pip-cuda-12.sh | 23 ++++++++++++------- python/benchmark/databricks/run_benchmark.sh | 13 ++++++++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/python/benchmark/databricks/gpu_etl_cluster_spec.sh b/python/benchmark/databricks/gpu_etl_cluster_spec.sh index ab31b70e..4a5d7b40 100644 --- a/python/benchmark/databricks/gpu_etl_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_etl_cluster_spec.sh @@ -24,7 +24,7 @@ cat < 16.4 && $cluster_type == "gpu_etl" ]]; then +if [[ $db_version != 17.3 && $cluster_type == "gpu_etl" ]]; then echo "spark rapids etl plugin is not supported on databricks ${db_version}" - echo "please specify db_version 15.4 or 16.4 for cluster type gpu_etl" + echo "please specify db_version 17.3 for cluster type gpu_etl" exit 1 fi +if [[ $db_version > 16.4 ]]; then + SCALA_VERSION=2.13 +fi + + source benchmark_utils.sh 
BENCHMARK_DATA_HOME=s3a://spark-rapids-ml-bm-datasets-public From b2982ae5ad5478a72717cbfc55a2a6235b914e52 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Wed, 3 Jun 2026 13:29:14 -0700 Subject: [PATCH 07/13] update copyright years Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- ci/test.sh | 2 +- docker/Dockerfile.pip | 2 +- docker/Dockerfile.python | 2 +- python/benchmark/databricks/gpu_etl_cluster_spec.sh | 2 +- python/benchmark/databricks/init-pip-cuda-12.sh | 2 +- python/benchmark/databricks/run_benchmark.sh | 2 +- python/src/spark_rapids_ml/__init__.py | 2 +- python/src/spark_rapids_ml/classification.py | 2 +- python/src/spark_rapids_ml/feature.py | 2 +- python/src/spark_rapids_ml/knn.py | 2 +- python/src/spark_rapids_ml/regression.py | 2 +- python/src/spark_rapids_ml/tree.py | 2 +- python/src/spark_rapids_ml/umap.py | 2 +- python/src/spark_rapids_ml/utils.py | 2 +- python/tests/test_linear_model.py | 2 +- python/tests/test_metrics.py | 2 +- python/tests/test_pipeline.py | 2 +- python/tests/test_random_forest.py | 2 +- python/tests/test_umap.py | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 306fca74..48f3ee82 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/ci/test.sh b/ci/test.sh index 94d46f1b..862f81f4 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index 58c6e5de..509b81eb 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index 44bf5d40..c46809d6 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/benchmark/databricks/gpu_etl_cluster_spec.sh b/python/benchmark/databricks/gpu_etl_cluster_spec.sh index 4a5d7b40..56e52de6 100644 --- a/python/benchmark/databricks/gpu_etl_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_etl_cluster_spec.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/benchmark/databricks/init-pip-cuda-12.sh b/python/benchmark/databricks/init-pip-cuda-12.sh index 8996f595..b8778e69 100644 --- a/python/benchmark/databricks/init-pip-cuda-12.sh +++ b/python/benchmark/databricks/init-pip-cuda-12.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/benchmark/databricks/run_benchmark.sh b/python/benchmark/databricks/run_benchmark.sh index 737c3acf..8d55490b 100755 --- a/python/benchmark/databricks/run_benchmark.sh +++ b/python/benchmark/databricks/run_benchmark.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/src/spark_rapids_ml/__init__.py b/python/src/spark_rapids_ml/__init__.py index e9b6bf10..ca5d2f8a 100644 --- a/python/src/spark_rapids_ml/__init__.py +++ b/python/src/spark_rapids_ml/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index b2a26bf2..eed109cb 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/src/spark_rapids_ml/feature.py b/python/src/spark_rapids_ml/feature.py index efb620ce..71ef0481 100644 --- a/python/src/spark_rapids_ml/feature.py +++ b/python/src/spark_rapids_ml/feature.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# Copyright (c) 2022-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index 041bfd87..adc2e32b 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# Copyright (c) 2022-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index 5dc5e130..da5e5394 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# Copyright (c) 2022-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index d4693918..13f5830a 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index d20b550e..03cce339 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index 945b3d2b..a236f9ba 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# Copyright (c) 2022-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/tests/test_linear_model.py b/python/tests/test_linear_model.py index c06832fd..6a51890d 100644 --- a/python/tests/test_linear_model.py +++ b/python/tests/test_linear_model.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2025, NVIDIA CORPORATION. +# Copyright (c) 2022-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/tests/test_metrics.py b/python/tests/test_metrics.py index 6c36359d..0ac2f8e9 100644 --- a/python/tests/test_metrics.py +++ b/python/tests/test_metrics.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/tests/test_pipeline.py b/python/tests/test_pipeline.py index 1caeb94b..215c6311 100644 --- a/python/tests/test_pipeline.py +++ b/python/tests/test_pipeline.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/tests/test_random_forest.py b/python/tests/test_random_forest.py index 5ab2f45d..088d3b90 100644 --- a/python/tests/test_random_forest.py +++ b/python/tests/test_random_forest.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. 
+# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/tests/test_umap.py b/python/tests/test_umap.py index fdf0c84c..e07ca1d7 100644 --- a/python/tests/test_umap.py +++ b/python/tests/test_umap.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From ab5a9cb31920de121a2a244386e39bef95836172 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Wed, 3 Jun 2026 14:28:18 -0700 Subject: [PATCH 08/13] bump python version in ci Docker Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 48f3ee82..1de8ca46 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -48,5 +48,5 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 # install cuML ARG RAPIDS_VERSION=26.06 -RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ +RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.11 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y From 624ebcbad7e10bd6d509d6d16d5f22907a2ac8ac Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Wed, 3 Jun 2026 14:32:07 -0700 Subject: [PATCH 09/13] add some TODOs to track official 26.06 rapids release Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 1 + notebooks/databricks/init-pip-cuda-12.sh | 1 + python/benchmark/databricks/init-pip-cuda-12.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/ci/Dockerfile
b/ci/Dockerfile index 1de8ca46..f49dc25b 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -48,5 +48,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 # install cuML ARG RAPIDS_VERSION=26.06 +# TODO change to rapidsai after rapids 26.06 is released RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.11 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/notebooks/databricks/init-pip-cuda-12.sh b/notebooks/databricks/init-pip-cuda-12.sh index f53bb590..1c053dfd 100644 --- a/notebooks/databricks/init-pip-cuda-12.sh +++ b/notebooks/databricks/init-pip-cuda-12.sh @@ -22,6 +22,7 @@ set -ex # Note that the SPARK_RAPIDS_VERSION will not necessarily match the RAPIDS_VERSION. Check https://nvidia.github.io/spark-rapids/docs/download.html for the latest compatible version of # spark-rapids version that verifies compatibility with your Databricks Runtime. (In this case, Databricks 17.3 ML LTS.) The available versions for RAPIDS_VERSION can be # found by executing "pip index versions spark-rapids-ml". +# TODO change RAPIDS_VERSION to 26.6.0 after rapids 26.06 is released RAPIDS_VERSION=25.12.0 SPARK_RAPIDS_VERSION=26.04.2 diff --git a/python/benchmark/databricks/init-pip-cuda-12.sh b/python/benchmark/databricks/init-pip-cuda-12.sh index b8778e69..850fe7f9 100644 --- a/python/benchmark/databricks/init-pip-cuda-12.sh +++ b/python/benchmark/databricks/init-pip-cuda-12.sh @@ -19,6 +19,7 @@ BENCHMARK_ZIP=/dbfs/path/to/benchmark.zip # IMPORTANT: specify rapids fully 23.10.0 and not 23.10 # also, in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 
23.08.2 and not 23.8.2) +# TODO change RAPIDS_VERSION to 26.6.0 after rapids 26.06 is released RAPIDS_VERSION=25.12.0 SPARK_RAPIDS_VERSION=26.04.2 From 71c8197c26a3e5e0572758c2021169be9c40b172 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Fri, 12 Jun 2026 23:36:38 -0700 Subject: [PATCH 10/13] updates to align with official 26.06 rapids release. update emr and dataproc Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 3 +-- ci/test.sh | 8 +++++++ docs/source/conf.py | 6 ++--- jvm/README.md | 8 +++---- jvm/pom.xml | 4 ++-- notebooks/aws-emr/init-bootstrap-action.sh | 22 +++++++++---------- notebooks/aws-emr/init-configurations.json | 6 ++--- notebooks/databricks/init-pip-cuda-12.sh | 3 +-- notebooks/dataproc/README.md | 2 +- notebooks/dataproc/spark_rapids_ml.sh | 4 ++-- python/benchmark/aws-emr/run_benchmark.sh | 12 +++++----- .../benchmark/databricks/init-pip-cuda-12.sh | 3 +-- python/benchmark/dataproc/init_benchmark.sh | 4 ++-- python/benchmark/dataproc/run_benchmark.sh | 16 ++++++++------ python/run_benchmark.sh | 4 ++-- 15 files changed, 57 insertions(+), 48 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index f49dc25b..f838721a 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -48,6 +48,5 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 # install cuML ARG RAPIDS_VERSION=26.06 -# TODO change to rapidsai after rapids 26.06 is released -RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.11 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.11 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/ci/test.sh b/ci/test.sh index 862f81f4..a2816091 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -47,3 +47,11 @@ pip install -r requirements_dev.txt && 
pip install -e . # plugin tests ./run_plugin_test.sh +# push draft release docs to gh-pages in nightly run +if [[ $type == "nightly" ]]; then + # if everything passed till now update draft release docs in gh-pages + # need to invoke docs.sh from top level of repo + cd .. # top level of repo + ci/docs.sh nightly +fi + diff --git a/docs/source/conf.py b/docs/source/conf.py index c7db1bf1..b284ea8a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,9 +21,9 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'spark-rapids-ml' -copyright = '2025, NVIDIA' +copyright = '2026, NVIDIA' author = 'NVIDIA' -release = '25.12.0' +release = '26.06.0' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/jvm/README.md b/jvm/README.md index 1eb7d92f..ad9581e7 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -31,7 +31,7 @@ JDK 17, Spark 4.0 ```shell # Create a new conda environment for the client - conda create -n pyspark-client python==3.10 + conda create -n pyspark-client python==3.11 conda activate pyspark-client # Install the PySpark client package @@ -50,10 +50,10 @@ including setting up the server and running client-side tests. 
To start the Spark Connect server with Spark Rapids ML support, follow these steps: ```shell -conda activate rapids-25.12 # from spark-rapids-ml installation +conda activate rapids-26.06 # from spark-rapids-ml installation export SPARK_HOME= export PYSPARK_PYTHON=$(which python) -export PLUGIN_JAR=$(pip show spark-rapids-ml | grep Location: | cut -d ' ' -f 2 )/spark_rapids_ml/jars/com.nvidia.rapids.ml-25.12.0.jar +export PLUGIN_JAR=$(pip show spark-rapids-ml | grep Location: | cut -d ' ' -f 2 )/spark_rapids_ml/jars/com.nvidia.rapids.ml-26.06.0.jar $SPARK_HOME/sbin/start-connect-server.sh --master local[*] \ --jars $PLUGIN_JAR \ --conf spark.driver.memory=20G @@ -107,7 +107,7 @@ mvn clean package -DskipTests if you would like to compile the plugin and run the unit tests, install `spark-rapids-ml` python package and its dependencies per the above instructions and run the following command: ``` shell -conda activate rapids-25.12 +conda activate rapids-26.06 export PYSPARK_PYTHON=$(which python) mvn clean package ``` diff --git a/jvm/pom.xml b/jvm/pom.xml index 09aae8bf..9b1e45a6 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -1,6 +1,6 @@