diff --git a/build_tools/utils.py b/build_tools/utils.py index d0f5eab425..f2548b4de6 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -14,7 +14,7 @@ import sys import platform from pathlib import Path -from importlib.metadata import version as get_version +from importlib.metadata import PackageNotFoundError, version as get_version from subprocess import CalledProcessError from typing import List, Optional, Tuple, Union @@ -292,10 +292,17 @@ def cuda_version() -> Tuple[int, ...]: version_str = get_version("nvidia-cuda-runtime-cu12") version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) return version_tuple - except importlib.metadata.PackageNotFoundError: + except PackageNotFoundError: raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") +def cusolvermp_pypi_package_name(cuda_major: Optional[int] = None) -> str: + """PyPI package providing cuSolverMp runtime libraries for a CUDA major version.""" + if cuda_major is None: + cuda_major = cuda_version()[0] + return f"nvidia-cusolvermp-cu{cuda_major}" + + def get_frameworks() -> List[str]: """DL frameworks to build support for""" _frameworks: List[str] = [] diff --git a/build_tools/wheel_utils/Dockerfile.aarch b/build_tools/wheel_utils/Dockerfile.aarch index c040dadcdb..4f6635f507 100644 --- a/build_tools/wheel_utils/Dockerfile.aarch +++ b/build_tools/wheel_utils/Dockerfile.aarch @@ -35,12 +35,23 @@ RUN dnf clean all RUN dnf -y install glog.aarch64 glog-devel.aarch64 RUN dnf -y install libnccl libnccl-devel libnccl-static +# Install cuSolverMp and expose its system libs for the TE CMake build. 
+RUN dnf -y install \ + libcusolvermp0-cuda-${CUDA_MAJOR} libcusolvermp0-devel-cuda-${CUDA_MAJOR} && \ + dnf clean all +RUN mkdir -p /opt/nvidia/cusolvermp && \ + ln -s /usr/include/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/include && \ + ln -s /usr/lib64/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/lib && \ + echo "/usr/lib64/libcusolvermp/${CUDA_MAJOR}" > /etc/ld.so.conf.d/999_nvidia_cusolvermp.conf && \ + ldconfig + ENV PATH="/usr/local/cuda/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/opt/nvidia/cusolvermp/lib:${LD_LIBRARY_PATH}" ENV CUDA_HOME=/usr/local/cuda ENV CUDA_ROOT=/usr/local/cuda ENV CUDA_PATH=/usr/local/cuda ENV CUDADIR=/usr/local/cuda +ENV CUSOLVERMP_HOME=/opt/nvidia/cusolvermp ENV NVTE_RELEASE_BUILD=1 CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_aarch64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"] diff --git a/build_tools/wheel_utils/Dockerfile.x86 b/build_tools/wheel_utils/Dockerfile.x86 index 2728b6b7c1..b01e443910 100644 --- a/build_tools/wheel_utils/Dockerfile.x86 +++ b/build_tools/wheel_utils/Dockerfile.x86 @@ -35,12 +35,23 @@ RUN dnf clean all RUN dnf -y install glog.x86_64 glog-devel.x86_64 RUN dnf -y install libnccl libnccl-devel libnccl-static +# Install cuSolverMp and expose its system libs for the TE CMake build. 
+RUN dnf -y install \ + libcusolvermp0-cuda-${CUDA_MAJOR} libcusolvermp0-devel-cuda-${CUDA_MAJOR} && \ + dnf clean all +RUN mkdir -p /opt/nvidia/cusolvermp && \ + ln -s /usr/include/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/include && \ + ln -s /usr/lib64/libcusolvermp/${CUDA_MAJOR} /opt/nvidia/cusolvermp/lib && \ + echo "/usr/lib64/libcusolvermp/${CUDA_MAJOR}" > /etc/ld.so.conf.d/999_nvidia_cusolvermp.conf && \ + ldconfig + ENV PATH="/usr/local/cuda/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/opt/nvidia/cusolvermp/lib:${LD_LIBRARY_PATH}" ENV CUDA_HOME=/usr/local/cuda ENV CUDA_ROOT=/usr/local/cuda ENV CUDA_PATH=/usr/local/cuda ENV CUDADIR=/usr/local/cuda +ENV CUSOLVERMP_HOME=/opt/nvidia/cusolvermp ENV NVTE_RELEASE_BUILD=1 CMD ["/bin/bash", "-c", "bash /TransformerEngine/build_tools/wheel_utils/build_wheels.sh manylinux_2_28_x86_64 $BUILD_METAPACKAGE $BUILD_COMMON $BUILD_PYTORCH $BUILD_JAX $CUDA_MAJOR"] diff --git a/build_tools/wheel_utils/build_wheels.sh b/build_tools/wheel_utils/build_wheels.sh index ff422e9d2c..17bde4ef17 100644 --- a/build_tools/wheel_utils/build_wheels.sh +++ b/build_tools/wheel_utils/build_wheels.sh @@ -25,6 +25,10 @@ git submodule update --init --recursive # Install deps /opt/python/cp310-cp310/bin/pip install cmake pybind11[global] ninja setuptools wheel +# Enable optional build features. cuSolverMp is provided by the build image +# (see Dockerfile.x86 / Dockerfile.aarch), which also sets CUSOLVERMP_HOME. 
+export NVTE_WITH_CUSOLVERMP=1 + if $BUILD_METAPACKAGE ; then cd /TransformerEngine NVTE_BUILD_METAPACKAGE=1 /opt/python/cp310-cp310/bin/python setup.py bdist_wheel 2>&1 | tee /wheelhouse/logs/metapackage.txt diff --git a/setup.py b/setup.py index ec277b6349..7f6b51c148 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ from build_tools.utils import ( cuda_archs, cuda_version, + cusolvermp_pypi_package_name, get_frameworks, remove_dups, min_python_version_str, @@ -109,6 +110,7 @@ def setup_requirements() -> Tuple[List[str], List[str]]: "pydantic", "importlib-metadata>=1.0", "packaging", + cusolvermp_pypi_package_name(), ] test_reqs: List[str] = ["pytest>=8.2.1"] diff --git a/transformer_engine/common/__init__.py b/transformer_engine/common/__init__.py index 40933f17a9..fd2d146616 100644 --- a/transformer_engine/common/__init__.py +++ b/transformer_engine/common/__init__.py @@ -255,6 +255,29 @@ def _nvidia_cudart_include_dir() -> str: return str(include_dir) if include_dir.exists() else "" +@functools.lru_cache(maxsize=None) +def _is_cusolvermp_installed_in_system() -> bool: + """Return True if ``ldconfig -p`` lists a cuSolverMp library (False off Linux or on failure).""" + + if platform.system() != "Linux": + return False + + try: + result = subprocess.run( + ["ldconfig", "-p"], + capture_output=True, + text=True, + check=False, + ) + except (OSError, subprocess.SubprocessError): + return False + + if result.returncode != 0: + return False + + return any("cusolvermp" in line.lower() for line in result.stdout.splitlines()) + + @functools.lru_cache(maxsize=None) def _load_cuda_library_from_python(lib_name: str, strict: bool = False): """ @@ -369,6 +392,11 @@ def _load_core_library(): _, _CUDNN_LIB_CTYPES = _load_cuda_library("cudnn") system_nvrtc, _NVRTC_LIB_CTYPES = _load_cuda_library("nvrtc") system_curand, _CURAND_LIB_CTYPES = _load_cuda_library("curand") + _CUSOLVERMP_LIB_CTYPES = None + if not _is_cusolvermp_installed_in_system() and any( + _is_package_installed(p) for p in 
("nvidia-cusolvermp-cu12", "nvidia-cusolvermp-cu13") + ): + _, _CUSOLVERMP_LIB_CTYPES = _load_cuda_library_from_python("cusolverMp", strict=False) # This additional step is necessary to be able to install TE wheels # and import TE (without any guards) in an environment where the cuda