Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to pre-pruned CUDA/CuDNN tarballs #60

Merged
merged 4 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mini.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ ENV UV_LINK_MODE=copy

# Install a single mini-mono venv
RUN --mount=type=cache,target=/srv/r8/monobase/uv/cache,id=uv-cache \
--mount=type=cache,target=/var/cache/monobase/,id=var-cache \
CI_SKIP_CUDA=1 /opt/r8/monobase/run.sh monobase.build --mini

########################################
Expand Down
2 changes: 1 addition & 1 deletion requirements-user.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ ipython
mypy
pyyaml
requests
torch==2.3.0
torch==2.5.1
21 changes: 21 additions & 0 deletions script/build-cuda
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# Build pre-pruned CUDA & CuDNN tarballs
#
# Runs the `monobase.cuda` module via /opt/r8/monobase/run.sh inside the
# `monobase:latest` image. Any arguments given to this script are forwarded
# unchanged to that module.

# Abort on errors, on unset variables, and on failures inside pipelines.
set -euo pipefail

# NOTE(review): this variable is neither exported nor referenced anywhere in
# this script, so it does not reach the container - confirm whether it is needed.
MONOBASE_PYTHON='3.13'

# Work from the repository root so the relative paths below resolve.
cd "$(git rev-parse --show-toplevel)"

# Host directories that back the container's prefix and cache mounts.
mkdir -p build/monobase build/cache

# Run as the invoking user so files written to the mounted host directories
# are owned by that user. Mounts:
#   src/monobase   -> /opt/r8/monobase     (source tree)
#   build/monobase -> /srv/r8/monobase     (prefix)
#   build/cache    -> /var/cache/monobase  (cache; tarballs are written here)
docker run --rm \
--user "$(id -u):$(id -g)" \
--volume "$PWD/src/monobase:/opt/r8/monobase" \
--volume "$PWD/build/monobase:/srv/r8/monobase" \
--volume "$PWD/build/cache:/var/cache/monobase" \
monobase:latest \
/opt/r8/monobase/run.sh \
monobase.cuda \
"$@"
2 changes: 2 additions & 0 deletions src/monobase/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

python -m monobase.build

python -m monobase.cuda

python -m monobase.diff

python -m monobase.monogen
Expand Down
8 changes: 5 additions & 3 deletions src/monobase/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,12 +147,14 @@ def build_generation(args: argparse.Namespace, mg: MonoGen) -> None:
suffix = '' if args.environment == 'prod' else f'-{args.environment}'
rdir = os.path.join(HERE, f'requirements{suffix}', f'g{mg.id:05d}')

if args.mini and len(mg.torch) == 1 and len(mg.cuda) == 0:
cuda_versions = desc_version(mg.cuda.keys())
if args.mini:
# Mini mono with Torch but without CUDA or CuDNN, use CPU Torch
cuda_versions = ['cpu']
if len(mg.torch) == 1 and len(mg.cuda) == 0:
cuda_versions = ['cpu']
else:
# Production, always add CPU torch
cuda_versions = ['cpu'] + desc_version(mg.cuda.keys())
cuda_versions = ['cpu'] + cuda_versions

for (p, pf), t, c in itertools.product(
desc_version_key(mg.python),
Expand Down
179 changes: 151 additions & 28 deletions src/monobase/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,17 @@
import subprocess
import urllib.parse
from dataclasses import dataclass
from multiprocessing import Pool

from monobase.urls import cuda_urls, cudnn_urls
from monobase.util import Version, mark_done, require_done_or_rm
from monobase.util import (
Version,
mark_done,
require_done_or_rm,
setup_logging,
)

R8_PACKAGE_PREFIX = 'https://monobase-packages.replicate.delivery'

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -65,26 +73,57 @@ def build_cudnns() -> dict[str, CuDNN]:
CUDNNS: dict[str, CuDNN] = build_cudnns()


def install_cuda(args: argparse.Namespace, version: str) -> str:
cdir = os.path.join(args.prefix, 'cuda', f'cuda-{version}')
if require_done_or_rm(cdir):
log.info(f'CUDA {version} in {cdir} is complete')
return cdir
def tar_and_delete(path: str, file: str) -> None:
    """Pack the contents of ``path`` into a zstd tarball ``file``, then delete ``path``.

    The archive is created with GNU tar's reproducibility flags (sorted
    entries, normalized timestamps, ownership and permissions). Entries are
    stored relative to ``path``, so the tarball has no top-level directory.

    The tarball is first written to ``<file>.tmp`` and only renamed to
    ``file`` once tar has succeeded. Callers treat the existence of ``file``
    as "already built", so an interrupted run must never leave a truncated
    archive under the final name.

    Args:
        path: Directory whose contents are archived and which is removed
            afterwards.
        file: Destination path of the ``.tar.zst`` archive.

    Raises:
        subprocess.CalledProcessError: If tar exits with a non-zero status.
            In that case ``path`` is left in place.
    """
    # https://www.gnu.org/software//tar/manual/html_section/Reproducibility.html
    tar_flags = [
        '--sort=name',
        '--format=posix',
        '--pax-option=exthdr.name=%d/PaxHeaders/%f',
        '--pax-option=delete=atime,delete=ctime,delete=btime,delete=mtime',
        '--mtime=0',
        '--numeric-owner',
        '--owner=0',
        '--group=0',
        '--mode=go+u,go-w',
    ]
    # tar runs with only these two variables set; the rest of the caller's
    # environment is not passed through.
    tar_env = {
        'LC_ALL': 'C',
        'TZ': 'UTC',
    }
    # A stale temporary file from an earlier failed attempt is simply
    # overwritten by `tar -cf`. It sits next to the destination so the final
    # rename stays on one filesystem.
    tmp_file = f'{file}.tmp'
    cmd = (
        ['tar', '-C', path]
        + tar_flags
        + ['--zstd', '-cf', tmp_file]
        + sorted(os.listdir(path))
    )
    subprocess.run(cmd, check=True, env=tar_env)
    # Publish the finished archive under its final name in one step.
    os.replace(tmp_file, file)
    # Best-effort cleanup of the now-archived tree.
    shutil.rmtree(path, ignore_errors=True)

if os.environ.get('CI_SKIP_CUDA') is not None:
os.makedirs(cdir, exist_ok=True)
mark_done(cdir, kind='cuda', version=version, skipped=True)
log.info(f'CUDA {version} skipped in {cdir}')
return cdir

def pget(args: argparse.Namespace, url: str, file: str) -> None:
    """Download ``url`` to ``file`` using the pget binary under ``args.prefix``.

    Args:
        args: Parsed CLI arguments; only ``args.prefix`` is read.
        url: Source URL handed to pget.
        file: Destination path handed to pget.

    Raises:
        subprocess.CalledProcessError: If pget exits with a non-zero status.
    """
    pget_bin = f'{args.prefix}/bin/pget'
    subprocess.run(
        [pget_bin, '--pid-file', '/tmp/pget.pid', url, file],
        check=True,
    )


def build_cuda_tarball(args: argparse.Namespace, version: str) -> None:
tf = os.path.join(args.cache, 'cuda', f'monobase-cuda-{version}.tar.zst')
if os.path.exists(tf):
return

cuda = CUDAS[version]
file = os.path.join(args.cache, cuda.filename)
file = os.path.join(args.cache, 'cuda', cuda.filename)
if not os.path.exists(file):
log.info(f'Downloading CUDA {version}...')
cmd = [f'{args.prefix}/bin/pget', '--pid-file', '/tmp/pget.pid', cuda.url, file]
subprocess.run(cmd, check=True)
pget(args, cuda.url, file)

log.info(f'Installing CUDA {version}...')
cdir = os.path.join(args.prefix, 'cuda', f'cuda-{version}')
cmd = [
'/bin/sh',
file,
Expand Down Expand Up @@ -115,7 +154,60 @@ def install_cuda(args: argparse.Namespace, version: str) -> str:
cmd = ['find', cdir, '-name', 'lib*.a', '-delete']
subprocess.run(cmd, check=True)

mark_done(cdir, kind='cuda', version=version, url=cuda.url)
log.info(f'Creating CUDA tarball {tf}...')
tar_and_delete(cdir, tf)


def build_cudnn_tarball(
    args: argparse.Namespace, version: str, cuda_major: str
) -> None:
    """Build the pre-pruned CuDNN tarball for one CuDNN/CUDA-major pair.

    Does nothing when the tarball already exists in the cache. Otherwise the
    upstream archive is downloaded (unless already cached), unpacked into a
    staging directory under ``args.prefix`` without ``lib*.a`` files, and
    repacked into ``monobase-cudnn-<version>-cuda<major>.tar.zst``.

    Args:
        args: Parsed CLI arguments; ``args.cache`` and ``args.prefix`` are read.
        version: CuDNN version string.
        cuda_major: CUDA major version the CuDNN build targets.

    Raises:
        KeyError: If no CuDNN entry exists for the version/major pair.
        subprocess.CalledProcessError: If a download, extract or tar step fails.
    """
    key = f'{version}-cuda{cuda_major}'
    cache_dir = os.path.join(args.cache, 'cudnn')
    tarball = os.path.join(cache_dir, f'monobase-cudnn-{key}.tar.zst')
    if os.path.exists(tarball):
        return

    cudnn = CUDNNS[key]
    archive = os.path.join(cache_dir, cudnn.filename)
    if not os.path.exists(archive):
        log.info(f'Downloading CuDNN {key}...')
        pget(args, cudnn.url, archive)

    log.info(f'Installing CuDNN {key}...')
    staging = os.path.join(args.prefix, 'cuda', f'cudnn-{key}')
    os.makedirs(staging, exist_ok=True)
    # Drop the archive's top-level directory and skip lib*.a files.
    subprocess.run(
        [
            'tar',
            '-xf',
            archive,
            '--strip-components=1',
            '--exclude=lib*.a',
            '-C',
            staging,
        ],
        check=True,
    )

    log.info(f'Creating CuDNN tarball {tarball}...')
    tar_and_delete(staging, tarball)


def install_cuda(args: argparse.Namespace, version: str) -> str:
    """Install the pre-pruned CUDA tarball for ``version`` under ``args.prefix``.

    The tarball is taken from ``<args.cache>/cuda`` when present; otherwise it
    is downloaded there from ``R8_PACKAGE_PREFIX``. It is then extracted into
    ``<args.prefix>/cuda/cuda-<version>`` and the directory is marked done.

    When the ``CI_SKIP_CUDA`` environment variable is set (to any value), an
    empty directory is created and marked as skipped instead.

    Args:
        args: Parsed CLI arguments; ``args.prefix`` and ``args.cache`` are read.
        version: CUDA version string.

    Returns:
        The CUDA installation directory.

    Raises:
        subprocess.CalledProcessError: If the download or extraction fails.
    """
    cdir = os.path.join(args.prefix, 'cuda', f'cuda-{version}')
    # Truthy result means a previous install finished. The helper's name
    # suggests it removes an incomplete directory otherwise - TODO confirm.
    if require_done_or_rm(cdir):
        log.info(f'CUDA {version} in {cdir} is complete')
        return cdir

    if os.environ.get('CI_SKIP_CUDA') is not None:
        os.makedirs(cdir, exist_ok=True)
        mark_done(cdir, kind='cuda', version=version, skipped=True)
        log.info(f'CUDA {version} skipped in {cdir}')
        return cdir

    filename = f'monobase-cuda-{version}.tar.zst'
    path = os.path.join(args.cache, 'cuda', filename)
    # Recorded by mark_done: a file:// URL on a cache hit, the remote URL
    # when the tarball had to be downloaded.
    url = f'file://{path}'
    if not os.path.exists(path):
        log.info(f'Downloading CUDA {version}...')
        url = f'{R8_PACKAGE_PREFIX}/cuda/{filename}'
        # The cache subdirectory may not exist yet on a fresh cache; whether
        # pget creates missing parent directories is not visible here.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        pget(args, url, path)

    log.info(f'Installing CUDA {version}...')
    os.makedirs(cdir, exist_ok=True)
    cmd = ['tar', '-xf', path, '-C', cdir]
    subprocess.run(cmd, check=True)

    mark_done(cdir, kind='cuda', version=version, url=url)
    log.info(f'CUDA {version} installed in {cdir}')
    return cdir

Expand All @@ -133,24 +225,55 @@ def install_cudnn(args: argparse.Namespace, version: str, cuda_major: str) -> st
log.info(f'CuDNN {key} skipped in {cdir}')
return cdir

cudnn = CUDNNS[key]
file = os.path.join(args.cache, cudnn.filename)
if not os.path.exists(file):
filename = f'monobase-cudnn-{key}.tar.zst'
path = os.path.join(args.cache, 'cudnn', filename)
url = f'file://{path}'
if not os.path.exists(path):
log.info(f'Downloading CuDNN {key}...')
cmd = [
f'{args.prefix}/bin/pget',
'--pid-file',
'/tmp/pget.pid',
cudnn.url,
file,
]
subprocess.run(cmd, check=True)
url = f'{R8_PACKAGE_PREFIX}/cudnn/{filename}'
pget(args, url, path)

log.info(f'Installing CuDNN {key}...')
os.makedirs(cdir, exist_ok=True)
cmd = ['tar', '-xf', file, '--strip-components=1', '--exclude=lib*.a', '-C', cdir]
cmd = ['tar', '-xf', path, '-C', cdir]
subprocess.run(cmd, check=True)

mark_done(cdir, kind='cudnn', version=version, url=cudnn.url)
mark_done(cdir, kind='cudnn', version=version, url=url)
log.info(f'CuDNN {key} installed in {cdir}')
return cdir


# Command-line interface: both options take a filesystem path and default to
# the standard monobase locations.
parser = argparse.ArgumentParser(description='Build monobase environment')
for _flag, _default, _help in (
    ('--prefix', '/srv/r8/monobase', 'prefix for monobase'),
    ('--cache', '/var/cache/monobase', 'cache for monobase'),
):
    parser.add_argument(_flag, metavar='PATH', default=_default, help=_help)


def build_tarballs(args: argparse.Namespace) -> None:
    """Build every CUDA and CuDNN tarball in parallel worker processes.

    One job is submitted per entry in ``CUDAS`` and per entry in ``CUDNNS``.
    All jobs are allowed to finish before any failure is reported, so one bad
    build does not abort the others mid-way.

    Args:
        args: Parsed CLI arguments; ``args.cache`` is read here and the whole
            namespace is passed on to the worker functions.

    Raises:
        Exception: The first exception raised by any worker is re-raised here.
    """
    # Workers write into these directories, so create them before submitting.
    os.makedirs(os.path.join(args.cache, 'cuda'), exist_ok=True)
    os.makedirs(os.path.join(args.cache, 'cudnn'), exist_ok=True)

    with Pool() as pool:
        results = [
            pool.apply_async(build_cuda_tarball, (args, k)) for k in CUDAS
        ]
        results += [
            pool.apply_async(
                build_cudnn_tarball,
                (args, str(v.cudnn_version), str(v.cuda_major)),
            )
            for v in CUDNNS.values()
        ]
        # Let every job run to completion first ...
        for r in results:
            r.wait()
        # ... then surface failures: wait() alone never re-raises a worker's
        # exception, which would let a failed build pass silently.
        for r in results:
            r.get()


# Script entry point: configure logging, then build all tarballs using the
# parsed command-line options.
if __name__ == '__main__':
    setup_logging()
    cli_args = parser.parse_args()
    build_tarballs(cli_args)
5 changes: 5 additions & 0 deletions src/monobase/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def getenv_or(key: str, default: str) -> str:
)

cuda_urls = [
f'{cuda_prefix}/cuda_12.6.3_560.35.05_linux.run',
f'{cuda_prefix}/cuda_12.6.2_560.35.03_linux.run',
f'{cuda_prefix}/cuda_12.6.1_560.35.03_linux.run',
f'{cuda_prefix}/cuda_12.6.0_560.28.03_linux.run',
Expand Down Expand Up @@ -50,6 +51,10 @@ def getenv_or(key: str, default: str) -> str:
)

cudnn_urls = [
f'{cudnn_prefix}/cudnn-linux-x86_64-9.6.0.74_cuda12-archive.tar.xz',
f'{cudnn_prefix}/cudnn-linux-x86_64-9.6.0.74_cuda11-archive.tar.xz',
f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.1.17_cuda12-archive.tar.xz',
f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.1.17_cuda11-archive.tar.xz',
f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.0.50_cuda12-archive.tar.xz',
f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.0.50_cuda11-archive.tar.xz',
f'{cudnn_prefix}/cudnn-linux-x86_64-9.4.0.58_cuda12-archive.tar.xz',
Expand Down
19 changes: 10 additions & 9 deletions src/monobase/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
require_done_or_rm,
setup_logging,
)
from monobase.uv import cuda_suffix
from monobase.uv import cuda_suffix, index_args

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -81,14 +81,15 @@ def build_user_venv(args: argparse.Namespace) -> None:
subprocess.run(cmd, check=True, env=env)

log.info(f'Compiling user requirements {args.requirements}...')
cmd = [
uv,
'pip',
'compile',
'--python-platform',
'x86_64-unknown-linux-gnu',
args.requirements,
]
cmd = [uv, 'pip', 'compile', '--python-platform', 'x86_64-unknown-linux-gnu']
# PyPI is inconsistent with Torch index and may include nvidia packages for CPU torch
# Use the same Torch index instead
tv = (
Version.parse('0.0.0')
if torch_version is None
else Version.parse(torch_version)
)
cmd = cmd + index_args(tv, cuda_version) + [args.requirements]
env['VIRTUAL_ENV'] = udir
try:
proc = subprocess.run(cmd, check=True, env=env, capture_output=True, text=True)
Expand Down
Loading