diff --git a/.claude/skills/update-rapids-version/SKILL.md b/.claude/skills/update-rapids-version/SKILL.md new file mode 100644 index 00000000..580e0be5 --- /dev/null +++ b/.claude/skills/update-rapids-version/SKILL.md @@ -0,0 +1,25 @@ +--- +name: update-rapids-version +description: Updates python code (e.g. internal api calls) so that tests pass after running in conda environment with updated rapids version. +--- + +You will be running in an already activated conda environment with the updated rapids dependencies. + +Make necessary code changes in the `python` directory tree to get the following test script to complete without error: + +```bash +cd python && CUDA_VISIBLE_DEVICES=0 bash run_test.sh +``` + +1. Fix any formatting errors reported by the script. +2. Fix any type-checking errors reported. +3. Fix all other pytest errors reported. + - Note that pytest phase runs through all tests before reporting any errors. This can take a while. + - Most failures will be due to changes to internal apis in cuML that we rely on. + + +Iterate on 1., 2., and 3. until script succeeds. The script can take a while to complete. + +For 3., when working on individual tests, especially if only a few are failing, it is faster to run only these tests via pytest directly, followed by a final full run. + +You may search the source code in the directory `../cuml` for relevant internal api changes. The branch for the desired version is checked out. diff --git a/ci/Dockerfile b/ci/Dockerfile index 13d119e9..f838721a 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -47,6 +47,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 && conda config --set solver libmamba # install cuML -ARG RAPIDS_VERSION=25.12 -RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ +ARG RAPIDS_VERSION=26.06 +RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.11 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/ci/test.sh b/ci/test.sh index 05c80a88..a2816091 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -47,15 +47,11 @@ pip install -r requirements_dev.txt && pip install -e . # plugin tests ./run_plugin_test.sh -# check compatibility with Spark 3.3 in nightly run -# also push draft release docs to gh-pages +# push draft release docs to gh-pages in nightly run if [[ $type == "nightly" ]]; then - pip uninstall pyspark -y - pip install pyspark~=3.3.0 - ./run_test.sh - ./run_benchmark.sh $bench_args # if everything passed till now update draft release docs in gh-pages # need to invoke docs.sh from top level of repo cd .. # top level of repo ci/docs.sh nightly fi + diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index ac9cd292..f435ae33 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,8 +17,8 @@ ARG CUDA_VERSION=12.2.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 -ARG PYSPARK_VERSION=3.3.1 -ARG RAPIDS_VERSION=25.12.0 +ARG PYSPARK_VERSION=3.4.4 +ARG RAPIDS_VERSION=26.06.0 ARG ARCH=amd64 #ARG ARCH=arm64 @@ -35,13 +35,27 @@ RUN apt-get update -y \ && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-17-jdk \ && rm -rf /var/lib/apt/lists +# some of the below needed for python, installed from source below, to have full functionality RUN apt-get update -y \ - && apt install -y git numactl python3.10-venv python3-pip python-is-python3 software-properties-common wget zip \ - && python -m pip install --upgrade pip \ - && rm -rf /var/lib/apt/lists + && apt install -y git numactl software-properties-common wget zip build-essential zlib1g-dev \ + libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev libbz2-dev + +# install python 3.11 as base image has python 3.10 +RUN bash -c "wget https://www.python.org/ftp/python/3.11.9/Python-3.11.9.tgz && \ + tar xzf Python-3.11.9.tgz && cd Python-3.11.9 && \ + ./configure --enable-optimizations && make altinstall" + +# 1. Define the venv path and update system PATH +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# 2. 
Create the virtual environment +RUN python3.11 -m venv $VIRTUAL_ENV + +RUN python -m pip install --upgrade pip RUN apt-get update -y \ - && apt install -y python3.10-dev cmake curl \ + && apt install -y cmake curl \ && rm -rf /var/lib/apt/lists # install RAPIDS @@ -55,13 +69,9 @@ RUN pip install --no-cache-dir \ numpy~=1.0 \ --extra-index-url=https://pypi.nvidia.com -# install python dependencies -RUN pip install --no-cache-dir pyspark==${PYSPARK_VERSION} "scikit-learn>=1.2.1" \ - && pip install --no-cache-dir "black>=23.1.0" "build>=0.10.0" "isort>=5.12.0" "mypy>=1.0.0" \ - numpydoc pydata-sphinx-theme pylint pytest "sphinx<6.0" "twine>=4.0.0" # Config JAVA_HOME -ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-$ARCH +ENV JAVA_HOME=/usr/lib/jvm/java-1.17.0-openjdk-$ARCH ### END OF CACHE ### diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index 0306459a..b84fef35 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -1,5 +1,5 @@ # -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -17,7 +17,7 @@ ARG CUDA_VERSION=12.2.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 -ARG RAPIDS_VERSION=25.12 +ARG RAPIDS_VERSION=26.06 # ubuntu22 RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ @@ -34,28 +34,24 @@ RUN apt update -y \ && rm -rf /var/lib/apt/lists # Config JAVA_HOME -ENV JAVA_HOME /usr/lib/jvm/java-1.17.0-openjdk-amd64 +ENV JAVA_HOME=/usr/lib/jvm/java-1.17.0-openjdk-amd64 # Install conda ENV PATH="/root/miniconda3/bin:${PATH}" -RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh \ +ENV CONDA_PLUGINS_AUTO_ACCEPT_TOS="yes" +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ && mkdir /root/.conda \ - && bash Miniconda3-py38_4.10.3-Linux-x86_64.sh -b \ - && rm -f Miniconda3-py38_4.10.3-Linux-x86_64.sh \ - && conda tos accept --override-channels -c conda-forge -c defaults \ - && conda init + && bash Miniconda3-latest-Linux-x86_64.sh -b \ + && rm -f Miniconda3-latest-Linux-x86_64.sh \ + && conda init && conda update -n base conda \ + && conda install -n base conda-libmamba-solver \ + && conda config --set solver libmamba # install cuML -RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.2 cuml=$RAPIDS_VERSION cudf=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION numpy~=1.0 \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.11 cuda-version=12.2 cuml=$RAPIDS_VERSION cudf=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION numpy~=1.0 \ && conda clean --all -f -y -# install python dependencies -RUN pip install --no-cache-dir "pyspark>=3.2.1" "scikit-learn>=1.2.1" \ - && pip install --no-cache-dir "black>=23.1.0" "build>=0.10.0" "isort>=5.12.0" "mypy>=1.0.0" \ - numpydoc pydata-sphinx-theme pylint pytest "sphinx<6.0" "twine>=4.0.0" - -### END OF CACHE ### #ARG RAPIDS_ML_VER=main #RUN git clone -b 
branch-$RAPIDS_ML_VER https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/docs/site/FAQ.md b/docs/site/FAQ.md index 53b2dd31..b83dbeb6 100644 --- a/docs/site/FAQ.md +++ b/docs/site/FAQ.md @@ -9,11 +9,11 @@ nav_order: 4 ### What versions of Apache Spark are supported? -Apache Spark version 3.3.1 or higher. +Apache Spark version 3.4 or higher. ### What versions of Python are supported -Python 3.10 or higher. +Python 3.11 or higher. ### How do I fix the "java.lang.IllegalArgumentException: valueCount must be >= 0" error? diff --git a/docs/site/compatibility.md b/docs/site/compatibility.md index 3432d4d7..f640b1b1 100644 --- a/docs/site/compatibility.md +++ b/docs/site/compatibility.md @@ -31,7 +31,7 @@ The following table shows the currently supported algorithms. The goal is to ex | Spark Rapids ML | CUDA | Spark | Python | | :-------------- | :---- | :----- | :----- | -| 1.0.0 | 12.0+ | 3.3+ | 3.10+ | +| 26.6.0 | 12.2+ | 3.4+ | 3.11+ | ## Single vs Double precision inputs diff --git a/docs/site/performance.md b/docs/site/performance.md index 4804f8c6..41494c7f 100644 --- a/docs/site/performance.md +++ b/docs/site/performance.md @@ -10,7 +10,7 @@ nav_order: 6 ## Stage-level scheduling Starting from spark-rapids-ml `23.10.0`, stage-level scheduling is automatically enabled. -Therefore, if you are using Spark **standalone** cluster version **`3.4.0`** or higher, we strongly recommend +Therefore, if you are using Spark **standalone** cluster version **`3.4`** or higher, we strongly recommend configuring the `"spark.task.resource.gpu.amount"` as a fractional value. This will enable running multiple tasks in parallel during the ETL phase to help the performance. An example configuration would be `"spark.task.resource.gpu.amount=1/spark.executor.cores"`. 
For example, @@ -30,7 +30,7 @@ a total of 12 tasks per executor will be executed concurrently during the ETL ph is then used internally to the library to automatically carry out the ML training phases using the required 1 gpu per task. However, if you are using a spark-rapids-ml version earlier than 23.10.0 or a Spark -standalone cluster version below 3.4.0, you need to make sure there will be only 1 task running at any time per executor. +standalone cluster version below 3.4, you need to make sure there will be only 1 task running at any time per executor. You can set `spark.task.cpus` equal to `spark.executor.cores`, or `"spark.task.resource.gpu.amount"=1`. For example, ``` bash diff --git a/docs/source/conf.py b/docs/source/conf.py index c7db1bf1..e7fd2fe0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. +# Copyright (c) 2025-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,9 +21,9 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'spark-rapids-ml' -copyright = '2025, NVIDIA' +copyright = '2025-2026, NVIDIA' author = 'NVIDIA' -release = '25.12.0' +release = '26.06.0' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/jvm/README.md b/jvm/README.md index 1eb7d92f..ad9581e7 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -31,7 +31,7 @@ JDK 17, Spark 4.0 ```shell # Create a new conda environment for the client - conda create -n pyspark-client python==3.10 + conda create -n pyspark-client python==3.11 conda activate pyspark-client # Install the PySpark client package @@ -50,10 +50,10 @@ including setting up the server and running client-side tests. 
To start the Spark Connect server with Spark Rapids ML support, follow these steps: ```shell -conda activate rapids-25.12 # from spark-rapids-ml installation +conda activate rapids-26.06 # from spark-rapids-ml installation export SPARK_HOME= export PYSPARK_PYTHON=$(which python) -export PLUGIN_JAR=$(pip show spark-rapids-ml | grep Location: | cut -d ' ' -f 2 )/spark_rapids_ml/jars/com.nvidia.rapids.ml-25.12.0.jar +export PLUGIN_JAR=$(pip show spark-rapids-ml | grep Location: | cut -d ' ' -f 2 )/spark_rapids_ml/jars/com.nvidia.rapids.ml-26.06.0.jar $SPARK_HOME/sbin/start-connect-server.sh --master local[*] \ --jars $PLUGIN_JAR \ --conf spark.driver.memory=20G @@ -107,7 +107,7 @@ mvn clean package -DskipTests if you would like to compile the plugin and run the unit tests, install `spark-rapids-ml` python package and its dependencies per the above instructions and run the following command: ``` shell -conda activate rapids-25.12 +conda activate rapids-26.06 export PYSPARK_PYTHON=$(which python) mvn clean package ``` diff --git a/jvm/pom.xml b/jvm/pom.xml index 09aae8bf..0f2b1fc5 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -1,6 +1,6 @@