diff --git a/docker/build_cpu_image.sh b/docker/build_cpu_image.sh
new file mode 100755
index 000000000..53d72fdf1
--- /dev/null
+++ b/docker/build_cpu_image.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+#
+# Build the vLLM CPU image from docker/vllm-cpu.Dockerfile.
+#
+# Current directory should be the root of the repository: the build context
+# is "." and the Dockerfile path below is relative to it.
+#
+# Optional environment overrides:
+#   VLLM_VERSION      vLLM version forwarded to the build as a build-arg
+#                     (default: v0.18.0)
+#   RHSM_SECRET_FILE  file exposed to the build as secret id "rhsm"
+#                     (default: rhsm.env, falling back to docker/rhsm.env)
+
+set -euo pipefail
+
+VLLM_VERSION=${VLLM_VERSION:-v0.18.0}
+
+RHSM_SECRET_FILE=${RHSM_SECRET_FILE:-rhsm.env}
+if [[ ! -f "${RHSM_SECRET_FILE}" && -f docker/rhsm.env ]]; then
+    RHSM_SECRET_FILE=docker/rhsm.env
+fi
+
+DOCKER_BUILDKIT=1 docker build \
+    --network host \
+    --secret "id=rhsm,src=${RHSM_SECRET_FILE}" \
+    -f docker/vllm-cpu.Dockerfile \
+    -t vllm_cpu:new_base_img \
+    --build-arg "VLLM_VERSION=${VLLM_VERSION}" \
+    .
+
+# debug dockerfile and run into shell with buildx:
+# ddocker () {
+#     BUILDX_EXPERIMENTAL=1 docker buildx debug --invoke /bin/bash --on=error "$@"
+# }
+
diff --git a/docker/rhsm.env b/docker/rhsm.env
new file mode 100644
index 000000000..318e2f633
--- /dev/null
+++ b/docker/rhsm.env
@@ -0,0 +1,2 @@
+RHSM_USER=''
+RHSM_PASS=''
diff --git a/docker/vllm-cpu.Dockerfile b/docker/vllm-cpu.Dockerfile
new file mode 100644
index 000000000..0dcd502b5
--- /dev/null
+++ b/docker/vllm-cpu.Dockerfile
@@ -0,0 +1,140 @@
+ARG BUILD_BASE_IMAGE=registry.redhat.io/rhai/base-image-cpu-rhel9:3.4.0-1777399554
+ARG PYTHON_VERSION=3.12
+# ARG UV_EXTRA_INDEX_URL=https://console.redhat.com/api/pypi/public-rhai/rhoai/3.4/cpu-ubi9/simple
+ARG UV_INDEX_URL=https://console.redhat.com/api/pypi/public-rhai/rhoai/3.4/cpu-ubi9/simple
+# may need passing a particular vllm version during build
+ARG VLLM_VERSION
+
+ARG RHSM_USER
+ARG RHSM_PASS
+
+#################### BASE IMAGE ####################
+FROM ${BUILD_BASE_IMAGE} AS base
+
+
+# # TODO: we most likely do not need all of the dnf installs below: they're already in the base image
+# RUN --mount=type=secret,id=rhsm \
+#     --mount=type=cache,target=/var/cache/dnf,sharing=locked \
+#     bash -euo pipefail -c '\
+#         source /run/secrets/rhsm; \
+#         cleanup() { \
+#             subscription-manager unregister >/dev/null 2>&1 || true; \
+#             subscription-manager clean >/dev/null 2>&1 || true; \
+#         }; \
+#         trap cleanup EXIT; \
+#         subscription-manager register \
+#             --username "$RHSM_USER" \
+#             --password "$RHSM_PASS" \
+#             --auto-attach; \
+#         subscription-manager repos \
+#             --enable codeready-builder-for-rhel-9-x86_64-rpms; \
+#         yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm; \
+#         /usr/bin/crb enable; \
+#         yum makecache; \
+#         yum install -y --setopt=install_weak_deps=False \
+#             python3 \
+#             python3-devel \
+#             zeromq \
+#             bzip2 \
+#             cpio \
+#             elfutils-debuginfod-client \
+#             ffmpeg-free \
+#             fftw \
+#             file \
+#             freetype \
+#             gcc \
+#             gcc-c++ \
+#             gdal-libs \
+#             gdb \
+#             geos \
+#             git-core \
+#             glibc-langpack-en \
+#             glog \
+#             gmp \
+#             gzip \
+#             hdf5 \
+#             jemalloc \
+#             jq \
+#             krb5-libs \
+#             lcms2 \
+#             libaio \
+#             libev \
+#             libjpeg \
+#             libmpc \
+#             libomp \
+#             libpng \
+#             libpq \
+#             libqhull_r \
+#             libsndfile \
+#             libtiff \
+#             libunwind \
+#             libva \
+#             libwebp \
+#             libxml2 \
+#             libxslt \
+#             libzip \
+#             libzstd \
+#             loguru \
+#             lz4 \
+#             make \
+#             mariadb-connector-c \
+#             mpfr \
+#             netcdf \
+#             numactl \
+#             nvtop \
+#             openblas openblas-openmp openblas-openmp64 openblas-serial openblas-serial64 openblas-threads openblas-threads64 \
+#             openjpeg2 \
+#             openmpi \
+#             proj \
+#             protobuf \
+#             qpdf \
+#             re2 \
+#             snappy \
+#             spatialindex \
+#             tbb \
+#             tesseract \
+#             thrift \
+#             unixODBC \
+#             utf8proc \
+#             wget \
+#             xz \
+#             xz-libs \
+#             zlib \
+#             zstd; \
+#         yum clean all'
+
+WORKDIR /workspace
+
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+
+ENV CC=/usr/bin/gcc CXX=/usr/bin/g++
+
+ENV PATH="/root/.local/bin:$PATH"
+ENV VIRTUAL_ENV="/opt/app-root/"
+ENV UV_PYTHON_INSTALL_DIR=/opt/app-root/python
+ARG PYTHON_VERSION
+# RUN uv venv ${VIRTUAL_ENV} --python ${PYTHON_VERSION} --seed
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+ENV UV_HTTP_TIMEOUT=500
+
+ENV LD_PRELOAD="/opt/app-root/lib/libiomp5.so"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+###################### BUILD IMAGE ####################
+FROM base AS vllm-cpu-build
+
+
+# Install Python dependencies
+ARG UV_EXTRA_INDEX_URL
+ARG UV_INDEX_URL
+ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL}
+ENV UV_INDEX_URL=${UV_INDEX_URL}
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_LINK_MODE="copy"
+
+ENV VLLM_LOGGING_LEVEL=DEBUG
+
+ARG VLLM_VERSION
+RUN uv pip install --torch-backend=cpu vllm==${VLLM_VERSION}
diff --git a/docker/vllm_ubi9_building_instructions.md b/docker/vllm_ubi9_building_instructions.md
deleted file mode 100644
index 670a07b26..000000000
--- a/docker/vllm_ubi9_building_instructions.md
+++ /dev/null
@@ -1,284 +0,0 @@
-Here we demonstrate how to build vLLM manually starting from a ubi9 docker image.
-
-# 0. start a base image container
-
-## start container
-```bash
-# you may modify privileged option and mount only specific GPU cards.
-# please refer to our docucments on https://developer.metax-tech.com
-docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
- --shm-size 100gb --ulimit memlock=-1 \
- --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
- --device=/dev/dri --device=/dev/mxcd \
- -v /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro \
- --name base_image \
- registry.access.redhat.com/ubi9/ubi:9.6 bash
-```
-Some packages needs subscription, in the following steps we ignore these packages.
-
-
-
-# 1. Installing Metax-Driver
-
-Reference: [MACA Repo](https://repos.metax-tech.com/gitea/repos/index/wiki/MACA.md#metax-driver)
-
-```bash
-# add repo source
-cat < /etc/yum.repos.d/metax-driver-centos.repo
-[metax-centos]
-name=Maca Driver Yum Repository
-baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/
-enabled=1
-gpgcheck=0
-EOF
-
-# would install the newest 3.1.0.x release
-# Metax-Driver mainly contains vbios and kmd file, which are not needed in a container.
-# Here we want to get the mx-smi management tool.
-# kernel version mismatch errors are ignored -yum makecache -yum install -y metax-driver mxgvm - -# check -rpm -qa | egrep "(metax|mxsmt|mxfw)" -``` - - -# 2. Installing MACA SDK - -Reference: [MACA Repo](https://repos.metax-tech.com/gitea/repos/index/wiki/MACA.md#metax-driver) - -may need some time according to your network speed - -```bash -cat < /etc/yum.repos.d/maca-sdk-rpm.repo -[maca-sdk] -name=Maca Sdk Yum Repository -baseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-$(uname -m)/ -enabled=1 -gpgcheck=0 -EOF - -yum makecache - -yum install -y maca_sdk - -rpm -qa | egrep "maca_sdk" - -# you may install specific MACA SDK version like this -yum --showduplicates list |grep maca_sdk -yum install maca_sdk-- -``` - - -# 3. Install torch - -Reference: [MACA Repo](https://repos.metax-tech.com/gitea/repos/index/wiki/MACA.md#metax-driver) - - -## 3.1 Setup a python environment using uv - -Our internal build pipeline uses conda. Here we use uv instead. - -Attention: NOT fully tested! - -```bash -curl -LsSf https://astral.sh/uv/install.sh | sh - -uv venv /opt/venv --python 3.10 -source /opt/venv/bin/activate -``` - - -## 3.2 Install dependent yum packages - -The following packages need subscription, so we just SKIP them. It is about lapack and IB/RDMA utils, NOT fully tested, but should not cause problems. - -* `lapack-devel librdmacm-utils libibverbs-utils` - -```bash -yum makecache && yum install -y \ - openblas-devel \ - gcc-c++ \ - libibverbs librdmacm libibumad openssh-server \ - && yum clean all -``` - -## 3.3 Install cu-bridge - -Cu-bridge is our cuda compatible package used to compile cuda code. Before installing torch, cu-bridge need to be installed. - -Please refer to [cu-bridge/02_User_Manual](https://gitee.com/metax-maca/cu-bridge/tree/master/docs/02_User_Manual) - - -```bash -# you may separate the building process in a single stage. 
- -yum install -y wget unzip cmake - -export MACA_PATH=/opt/maca - -wget https://gitee.com/metax-maca/cu-bridge/repository/archive/3.1.0.zip -unzip 3.1.0.zip -mv cu-bridge-3.1.0 cu-bridge -chmod 755 cu-bridge -Rf -cd cu-bridge -mkdir build && cd ./build -cmake -DCMAKE_INSTALL_PREFIX=/opt/maca/tools/cu-bridge ../ -make && make install -``` - - -## 3.4 Some import environment settings - -You need to set the following envs to make sure maca-pytorch running properly. The cucc parts are manily used for compiling. - -```bash -export MACA_PATH=/opt/maca - -export MACA_CLANG_PATH=${MACA_PATH}/mxgpu_llvm/bin - -export CUCC_PATH=${MACA_PATH}/tools/cu-bridge -export CUDA_PATH=${CUCC_PATH} -export PATH=$PATH:${CUCC_PATH}/tools:${CUCC_PATH}/bin - -export PATH=/opt/mxdriver/bin:${MACA_PATH}/bin:${MACA_CLANG_PATH}:${PATH} -export LD_LIBRARY_PATH=/opt/mxdriver/lib:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/ucx/lib:${LD_LIBRARY_PATH} -``` - - -## 3.5 Install torch using uv - -1. Currently in our internal CI/CD flow, we install from local wheel packages. Here we install packages from metax pypi source. -2. Metax maca version is added to package's local version, e.g., "+metax3.1.0.4torch2.6", and packages's version should match with the installed maca SDK version. -3. The only way to specify maca version is giving the full version name. It would be better to use separate channels for different maca versions, which is in our plan. - - -```bash - -# `datasets` only has 3.1.0 in metax's pypi repo. first install from pulbic source. 
-uv pip install datasets==4.1.1 - -cat < ./requirements.txt -apex==0.1+metax3.1.0.4 -causal_conv1d==1.5.0.post8+metax3.1.0.4torch2.6 -dropout_layer_norm==0.1+metax3.1.0.4torch2.6 -flash_attn==2.6.3+metax3.1.0.4torch2.6 -flash_linear_attention==0.1+metax3.1.0.4torch2.6 -flash_mla==1.0.1+metax3.1.0.4torch2.6 -flashinfer==0.2.2.post1+metax3.1.0.4torch2.6 -fused_dense_lib==2.6.3+metax3.1.0.4torch2.6 -mamba_ssm==2.2.4+metax3.1.0.4torch2.6 -mctlassEx==0.1.1+metax3.1.0.4torch2.6 -rotary_emb==0.1+metax3.1.0.4torch2.6 -sageattention==2.0.1+metax3.1.0.4torch2.6 -spconv==2.1.0+metax3.1.0.4torch2.6 -torch==2.6.0+metax3.1.0.4 -torchaudio==2.4.1+metax3.1.0.4 -torchvision==0.15.1+metax3.1.0.4 -triton==3.0.0+metax3.1.0.4 -xentropy_cuda_lib==0.1+metax3.1.0.4torch2.6 -xformers==0.0.22+metax3.1.0.4torch2.6 -EOF - -uv pip install -r ./requirements.txt -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com - -# torch need to use numpy < 2.0 -uv pip install numpy==1.26.4 -``` - -You may try the following commands: - -```bash -# Search for available versions: -pip index versions mcspconv -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com - -# install a package -uv pip install torch torchaudio torchvision -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com -``` - -Check torch installing result: - -```bash -python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count());" -``` - - - -# 4. 
Build and install vllm - -Reference: [vLLM-metax](https://github.com/MetaX-MACA/vLLM-metax) - - -```bash -yum install -y vim zip wget tar tzdata \ - make cmake ninja-build gcc gcc-c++ procps-ng libxml2 openssh-server libXau \ - openblas-devel \ - libibverbs librdmacm libibumad \ - && yum clean all - -uv pip install /opt/maca/share/mxsml/pymxsml-*.whl -uv pip install tokenizers==0.20.3 orjson==3.10.6 - - -yum install -y git - -# build or install vllm -uv pip install vllm==0.10.2 --no-deps - - -git clone --depth 1 --branch v0.10.2 https://github.com/MetaX-MACA/vLLM-metax.git -cd vLLM-metax - -# build vllm on maca needs cuda 11.6 -wget https://developer.download.nvidia.com/compute/cuda/11.6.0/local_installers/cuda_11.6.0_510.39.01_linux.run && \ - sh cuda_11.6.0_510.39.01_linux.run --silent --toolkit && \ - rm cuda_11.6.0_510.39.01_linux.run - - -# setup MACA path -export MACA_PATH="/opt/maca" - -# setup CUDA && cu-bridge -export CUDA_PATH="/usr/local/cuda" -export CUCC_PATH="${MACA_PATH}/tools/cu-bridge" - -# update PATH -export PATH=${CUDA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${PATH} -export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} - -export VLLM_INSTALL_PUNICA_KERNELS=1 - - -# install requirements for building -uv pip install -r requirements/build.txt -# build wheels -python setup.py bdist_wheel -# install wheels -uv pip install dist/*.whl - -``` - - -## 5. Install ray - - -```bash -yum install -y patch - -uv pip install click==8.2.1 - -# the following packages cannot be installed. 
SKIP -# uv pip install mcpy==2.1.9.4+b3.1.0.14 numbax==2.1.9.4+b3.1.0.14 -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host repos.metax-tech.com - -pip install ray==2.46.0 - -unzip ray-patch.zip -cp -rd ray-patch /workspace -cd /workspace/ray-patch/ray_patch -python apply_ray_patch.py mx_ray_2.46.batch - -if [ -f "/opt/conda/bin/ray" ]; then - ln -sf /opt/conda/bin/ray /bin/ray -fi -```