Skip to content

Commit ca81ff5

Browse files
authored
[Core] manage nccl via a pypi package & upgrade to pt 2.2.1 (vllm-project#3805)
1 parent b778200 commit ca81ff5

File tree

8 files changed

+36
-11
lines changed

8 files changed

+36
-11
lines changed

Diff for: .github/workflows/publish.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ jobs:
4949
matrix:
5050
os: ['ubuntu-20.04']
5151
python-version: ['3.8', '3.9', '3.10', '3.11']
52-
pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
52+
pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements.txt.
5353
cuda-version: ['11.8', '12.1']
5454

5555
steps:

Diff for: CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
3131
# requirements.txt files and should be kept consistent. The ROCm torch
3232
# versions are derived from Dockerfile.rocm
3333
#
34-
set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
34+
set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
3535
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
3636
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
3737

Diff for: Dockerfile

+7-3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,13 @@ RUN --mount=type=cache,target=/root/.cache/pip \
2424
COPY requirements-dev.txt requirements-dev.txt
2525
RUN --mount=type=cache,target=/root/.cache/pip \
2626
pip install -r requirements-dev.txt
27+
28+
# cuda arch list used by torch
29+
# can be useful for both `dev` and `test`
30+
# explicitly set the list to avoid issues with torch 2.2
31+
# see https://github.com/pytorch/pytorch/pull/123243
32+
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
33+
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
2734
#################### BASE BUILD IMAGE ####################
2835

2936

@@ -47,9 +54,6 @@ COPY requirements.txt requirements.txt
4754
COPY pyproject.toml pyproject.toml
4855
COPY vllm/__init__.py vllm/__init__.py
4956

50-
# cuda arch list used by torch
51-
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
52-
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
5357
# max jobs used by Ninja to build extensions
5458
ARG max_jobs=2
5559
ENV MAX_JOBS=${max_jobs}

Diff for: pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ requires = [
55
"ninja",
66
"packaging",
77
"setuptools >= 49.4.0",
8-
"torch == 2.1.2",
8+
"torch == 2.2.1",
99
"wheel",
1010
]
1111
build-backend = "setuptools.build_meta"

Diff for: requirements-build.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ cmake>=3.21
33
ninja
44
packaging
55
setuptools>=49.4.0
6-
torch==2.1.2
6+
torch==2.2.1
77
wheel

Diff for: requirements.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ psutil
44
ray >= 2.9
55
sentencepiece # Required for LLaMA tokenizer.
66
numpy
7-
torch == 2.1.2
7+
torch == 2.2.1
88
requests
99
py-cpuinfo
1010
transformers >= 4.39.1 # Required for StarCoder2 & Llava.
11-
xformers == 0.0.23.post1 # Required for CUDA 12.1.
11+
xformers == 0.0.25 # Requires PyTorch 2.2.1.
1212
fastapi
1313
uvicorn[standard]
1414
pydantic >= 2.0 # Required for OpenAI server.
@@ -17,3 +17,4 @@ pynvml == 11.5.0
1717
triton >= 2.1.0
1818
outlines == 0.0.34
1919
tiktoken == 0.6.0 # Required for DBRX tokenizer
20+
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library

Diff for: setup.py

+10
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,16 @@ def get_requirements() -> List[str]:
328328
if _is_cuda():
329329
with open(get_path("requirements.txt")) as f:
330330
requirements = f.read().strip().split("\n")
331+
cuda_major = torch.version.cuda.split(".")[0]
332+
modified_requirements = []
333+
for req in requirements:
334+
if "vllm-nccl-cu12" in req:
335+
modified_requirements.append(
336+
req.replace("vllm-nccl-cu12",
337+
f"vllm-nccl-cu{cuda_major}"))
338+
else:
339+
modified_requirements.append(req)
340+
requirements = modified_requirements
331341
elif _is_hip():
332342
with open(get_path("requirements-rocm.txt")) as f:
333343
requirements = f.read().strip().split("\n")

Diff for: vllm/model_executor/parallel_utils/pynccl.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import ctypes
2323
import datetime
24+
import glob
2425
import os
2526

2627
# ===================== import region =====================
@@ -34,18 +35,27 @@
3435

3536
so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
3637

38+
# check if we have vllm-managed nccl
39+
vllm_nccl_path = None
40+
if torch.version.cuda is not None:
41+
cuda_major = torch.version.cuda.split(".")[0]
42+
path = os.path.expanduser(
43+
f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
44+
files = glob.glob(path)
45+
vllm_nccl_path = files[0] if files else None
46+
3747
# manually load the nccl library
3848
if so_file:
3949
logger.info(
4050
f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}")
4151
else:
4252
if torch.version.cuda is not None:
43-
so_file = "libnccl.so.2"
53+
so_file = vllm_nccl_path or "libnccl.so.2"
4454
elif torch.version.hip is not None:
4555
so_file = "librccl.so.1"
4656
else:
4757
raise ValueError("NCCL only supports CUDA and ROCm backends.")
48-
logger.debug(f"Loading nccl from library {so_file}")
58+
logger.info(f"Loading nccl from library {so_file}")
4959

5060
try:
5161
nccl = ctypes.CDLL(so_file)

0 commit comments

Comments (0)