diff --git a/mini.Dockerfile b/mini.Dockerfile
index 860f53d..9e96232 100644
--- a/mini.Dockerfile
+++ b/mini.Dockerfile
@@ -16,6 +16,7 @@ ENV UV_LINK_MODE=copy
 
 # Install a single mini-mono venv
 RUN --mount=type=cache,target=/srv/r8/monobase/uv/cache,id=uv-cache \
+    --mount=type=cache,target=/var/cache/monobase/,id=var-cache \
     CI_SKIP_CUDA=1 /opt/r8/monobase/run.sh monobase.build --mini
 
 ########################################
diff --git a/requirements-user.txt b/requirements-user.txt
index 72fd8c4..782a544 100644
--- a/requirements-user.txt
+++ b/requirements-user.txt
@@ -4,4 +4,4 @@ ipython
 mypy
 pyyaml
 requests
-torch==2.3.0
+torch==2.5.1
diff --git a/script/build-cuda b/script/build-cuda
new file mode 100755
index 0000000..396c757
--- /dev/null
+++ b/script/build-cuda
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+# Build pre-pruned CUDA & CuDNN tarballs
+
+set -euo pipefail
+
+MONOBASE_PYTHON='3.13'
+
+cd "$(git rev-parse --show-toplevel)"
+
+mkdir -p build/monobase build/cache
+
+docker run --rm \
+    --user "$(id -u):$(id -g)" \
+    --volume "$PWD/src/monobase:/opt/r8/monobase" \
+    --volume "$PWD/build/monobase:/srv/r8/monobase" \
+    --volume "$PWD/build/cache:/var/cache/monobase" \
+    monobase:latest \
+    /opt/r8/monobase/run.sh \
+    monobase.cuda \
+    "$@"
diff --git a/src/monobase/__main__.py b/src/monobase/__main__.py
index febf349..099a0dc 100644
--- a/src/monobase/__main__.py
+++ b/src/monobase/__main__.py
@@ -10,6 +10,8 @@
 
     python -m monobase.build
 
+    python -m monobase.cuda
+
     python -m monobase.diff
 
     python -m monobase.monogen
diff --git a/src/monobase/build.py b/src/monobase/build.py
index c3c59d9..73fff06 100644
--- a/src/monobase/build.py
+++ b/src/monobase/build.py
@@ -147,12 +147,14 @@ def build_generation(args: argparse.Namespace, mg: MonoGen) -> None:
     suffix = '' if args.environment == 'prod' else f'-{args.environment}'
     rdir = os.path.join(HERE, f'requirements{suffix}', f'g{mg.id:05d}')
 
-    if args.mini and len(mg.torch) == 1 and len(mg.cuda) == 0:
+    cuda_versions = desc_version(mg.cuda.keys())
+    if args.mini:
         # Mini mono with Torch but without CUDA or CuDNN, use CPU Torch
-        cuda_versions = ['cpu']
+        if len(mg.torch) == 1 and len(mg.cuda) == 0:
+            cuda_versions = ['cpu']
     else:
         # Production, always add CPU torch
-        cuda_versions = ['cpu'] + desc_version(mg.cuda.keys())
+        cuda_versions = ['cpu'] + cuda_versions
 
     for (p, pf), t, c in itertools.product(
         desc_version_key(mg.python),
diff --git a/src/monobase/cuda.py b/src/monobase/cuda.py
index 5e7b2bd..f84c1a3 100644
--- a/src/monobase/cuda.py
+++ b/src/monobase/cuda.py
@@ -6,9 +6,17 @@
 import subprocess
 import urllib.parse
 from dataclasses import dataclass
+from multiprocessing import Pool
 
 from monobase.urls import cuda_urls, cudnn_urls
-from monobase.util import Version, mark_done, require_done_or_rm
+from monobase.util import (
+    Version,
+    mark_done,
+    require_done_or_rm,
+    setup_logging,
+)
+
+R8_PACKAGE_PREFIX = 'https://monobase-packages.replicate.delivery'
 
 log = logging.getLogger(__name__)
 
@@ -65,26 +73,57 @@ def build_cudnns() -> dict[str, CuDNN]:
 CUDNNS: dict[str, CuDNN] = build_cudnns()
 
 
-def install_cuda(args: argparse.Namespace, version: str) -> str:
-    cdir = os.path.join(args.prefix, 'cuda', f'cuda-{version}')
-    if require_done_or_rm(cdir):
-        log.info(f'CUDA {version} in {cdir} is complete')
-        return cdir
+def tar_and_delete(path: str, file: str) -> None:
+    # https://www.gnu.org/software//tar/manual/html_section/Reproducibility.html
+    tar_flags = [
+        '--sort=name',
+        '--format=posix',
+        '--pax-option=exthdr.name=%d/PaxHeaders/%f',
+        '--pax-option=delete=atime,delete=ctime,delete=btime,delete=mtime',
+        '--mtime=0',
+        '--numeric-owner',
+        '--owner=0',
+        '--group=0',
+        '--mode=go+u,go-w',
+    ]
+    tar_env = {
+        'LC_ALL': 'C',
+        'TZ': 'UTC',
+    }
+    cmd = (
+        ['tar', '-C', path]
+        + tar_flags
+        + ['--zstd', '-cf', file]
+        + sorted(os.listdir(path))
+    )
+    subprocess.run(cmd, check=True, env=tar_env)
+    shutil.rmtree(path, ignore_errors=True)
 
-    if os.environ.get('CI_SKIP_CUDA') is not None:
-        os.makedirs(cdir, exist_ok=True)
-        mark_done(cdir, kind='cuda', version=version, skipped=True)
-        log.info(f'CUDA {version} skipped in {cdir}')
-        return cdir
+
+def pget(args: argparse.Namespace, url: str, file: str) -> None:
+    cmd = [
+        f'{args.prefix}/bin/pget',
+        '--pid-file',
+        '/tmp/pget.pid',
+        url,
+        file,
+    ]
+    subprocess.run(cmd, check=True)
+
+
+def build_cuda_tarball(args: argparse.Namespace, version: str) -> None:
+    tf = os.path.join(args.cache, 'cuda', f'monobase-cuda-{version}.tar.zst')
+    if os.path.exists(tf):
+        return
 
     cuda = CUDAS[version]
-    file = os.path.join(args.cache, cuda.filename)
+    file = os.path.join(args.cache, 'cuda', cuda.filename)
     if not os.path.exists(file):
         log.info(f'Downloading CUDA {version}...')
-        cmd = [f'{args.prefix}/bin/pget', '--pid-file', '/tmp/pget.pid', cuda.url, file]
-        subprocess.run(cmd, check=True)
+        pget(args, cuda.url, file)
 
     log.info(f'Installing CUDA {version}...')
+    cdir = os.path.join(args.prefix, 'cuda', f'cuda-{version}')
     cmd = [
         '/bin/sh',
         file,
@@ -115,7 +154,60 @@ def install_cuda(args: argparse.Namespace, version: str) -> str:
     cmd = ['find', cdir, '-name', 'lib*.a', '-delete']
     subprocess.run(cmd, check=True)
 
-    mark_done(cdir, kind='cuda', version=version, url=cuda.url)
+    log.info(f'Creating CUDA tarball {tf}...')
+    tar_and_delete(cdir, tf)
+
+
+def build_cudnn_tarball(
+    args: argparse.Namespace, version: str, cuda_major: str
+) -> None:
+    key = f'{version}-cuda{cuda_major}'
+    tf = os.path.join(args.cache, 'cudnn', f'monobase-cudnn-{key}.tar.zst')
+    if os.path.exists(tf):
+        return
+
+    cudnn = CUDNNS[key]
+    file = os.path.join(args.cache, 'cudnn', cudnn.filename)
+    if not os.path.exists(file):
+        log.info(f'Downloading CuDNN {key}...')
+        pget(args, cudnn.url, file)
+
+    log.info(f'Installing CuDNN {key}...')
+    cdir = os.path.join(args.prefix, 'cuda', f'cudnn-{key}')
+    os.makedirs(cdir, exist_ok=True)
+    cmd = ['tar', '-xf', file, '--strip-components=1', '--exclude=lib*.a', '-C', cdir]
+    subprocess.run(cmd, check=True)
+
+    log.info(f'Creating CuDNN tarball {tf}...')
+    tar_and_delete(cdir, tf)
+
+
+def install_cuda(args: argparse.Namespace, version: str) -> str:
+    cdir = os.path.join(args.prefix, 'cuda', f'cuda-{version}')
+    if require_done_or_rm(cdir):
+        log.info(f'CUDA {version} in {cdir} is complete')
+        return cdir
+
+    if os.environ.get('CI_SKIP_CUDA') is not None:
+        os.makedirs(cdir, exist_ok=True)
+        mark_done(cdir, kind='cuda', version=version, skipped=True)
+        log.info(f'CUDA {version} skipped in {cdir}')
+        return cdir
+
+    filename = f'monobase-cuda-{version}.tar.zst'
+    path = os.path.join(args.cache, 'cuda', filename)
+    url = f'file://{path}'
+    if not os.path.exists(path):
+        log.info(f'Downloading CUDA {version}...')
+        url = f'{R8_PACKAGE_PREFIX}/cuda/{filename}'
+        pget(args, url, path)
+
+    log.info(f'Installing CUDA {version}...')
+    os.makedirs(cdir, exist_ok=True)
+    cmd = ['tar', '-xf', path, '-C', cdir]
+    subprocess.run(cmd, check=True)
+
+    mark_done(cdir, kind='cuda', version=version, url=url)
     log.info(f'CUDA {version} installed in {cdir}')
     return cdir
 
@@ -133,24 +225,64 @@ def install_cudnn(args: argparse.Namespace, version: str, cuda_major: str) -> st
         log.info(f'CuDNN {key} skipped in {cdir}')
         return cdir
 
-    cudnn = CUDNNS[key]
-    file = os.path.join(args.cache, cudnn.filename)
-    if not os.path.exists(file):
+    filename = f'monobase-cudnn-{key}.tar.zst'
+    path = os.path.join(args.cache, 'cudnn', filename)
+    url = f'file://{path}'
+    if not os.path.exists(path):
         log.info(f'Downloading CuDNN {key}...')
-        cmd = [
-            f'{args.prefix}/bin/pget',
-            '--pid-file',
-            '/tmp/pget.pid',
-            cudnn.url,
-            file,
-        ]
-        subprocess.run(cmd, check=True)
+        url = f'{R8_PACKAGE_PREFIX}/cudnn/{filename}'
+        pget(args, url, path)
 
     log.info(f'Installing CuDNN {key}...')
     os.makedirs(cdir, exist_ok=True)
-    cmd = ['tar', '-xf', file, '--strip-components=1', '--exclude=lib*.a', '-C', cdir]
+    cmd = ['tar', '-xf', path, '-C', cdir]
     subprocess.run(cmd, check=True)
 
-    mark_done(cdir, kind='cudnn', version=version, url=cudnn.url)
+    mark_done(cdir, kind='cudnn', version=version, url=url)
     log.info(f'CuDNN {key} installed in {cdir}')
     return cdir
+
+
+parser = argparse.ArgumentParser(description='Build monobase environment')
+parser.add_argument(
+    '--prefix',
+    metavar='PATH',
+    default='/srv/r8/monobase',
+    help='prefix for monobase',
+)
+parser.add_argument(
+    '--cache',
+    metavar='PATH',
+    default='/var/cache/monobase',
+    help='cache for monobase',
+)
+
+
+def build_tarballs(args: argparse.Namespace) -> None:
+    """Build all CUDA and CuDNN tarballs in a pool of worker processes.
+
+    Raises the first exception hit by a worker, e.g.
+    subprocess.CalledProcessError from a failed pget or tar run.
+    """
+    with Pool() as pool:
+        results = []
+        os.makedirs(os.path.join(args.cache, 'cuda'), exist_ok=True)
+        for k in CUDAS.keys():
+            r = pool.apply_async(build_cuda_tarball, (args, k))
+            results.append(r)
+        os.makedirs(os.path.join(args.cache, 'cudnn'), exist_ok=True)
+        for v in CUDNNS.values():
+            a = (args, str(v.cudnn_version), str(v.cuda_major))
+            r = pool.apply_async(build_cudnn_tarball, a)
+            results.append(r)
+        for r in results:
+            r.wait()
+        # wait() never raises, so surface worker failures explicitly once
+        # every job has finished; otherwise a failed build still exits 0
+        for r in results:
+            r.get()
+
+
+if __name__ == '__main__':
+    setup_logging()
+    build_tarballs(parser.parse_args())
diff --git a/src/monobase/urls.py b/src/monobase/urls.py
index 7d14507..83a1483 100644
--- a/src/monobase/urls.py
+++ b/src/monobase/urls.py
@@ -18,6 +18,7 @@ def getenv_or(key: str, default: str) -> str:
 )
 
 cuda_urls = [
+    f'{cuda_prefix}/cuda_12.6.3_560.35.05_linux.run',
     f'{cuda_prefix}/cuda_12.6.2_560.35.03_linux.run',
     f'{cuda_prefix}/cuda_12.6.1_560.35.03_linux.run',
     f'{cuda_prefix}/cuda_12.6.0_560.28.03_linux.run',
@@ -50,6 +51,10 @@ def getenv_or(key: str, default: str) -> str:
 )
 
 cudnn_urls = [
+    f'{cudnn_prefix}/cudnn-linux-x86_64-9.6.0.74_cuda12-archive.tar.xz',
+    f'{cudnn_prefix}/cudnn-linux-x86_64-9.6.0.74_cuda11-archive.tar.xz',
+    f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.1.17_cuda12-archive.tar.xz',
+    f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.1.17_cuda11-archive.tar.xz',
     f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.0.50_cuda12-archive.tar.xz',
     f'{cudnn_prefix}/cudnn-linux-x86_64-9.5.0.50_cuda11-archive.tar.xz',
     f'{cudnn_prefix}/cudnn-linux-x86_64-9.4.0.58_cuda12-archive.tar.xz',
diff --git a/src/monobase/user.py b/src/monobase/user.py
index dd5964a..ebad118 100644
--- a/src/monobase/user.py
+++ b/src/monobase/user.py
@@ -11,7 +11,7 @@
     require_done_or_rm,
     setup_logging,
 )
-from monobase.uv import cuda_suffix
+from monobase.uv import cuda_suffix, index_args
 
 log = logging.getLogger(__name__)
 
@@ -81,14 +81,15 @@ def build_user_venv(args: argparse.Namespace) -> None:
     subprocess.run(cmd, check=True, env=env)
 
     log.info(f'Compiling user requirements {args.requirements}...')
-    cmd = [
-        uv,
-        'pip',
-        'compile',
-        '--python-platform',
-        'x86_64-unknown-linux-gnu',
-        args.requirements,
-    ]
+    cmd = [uv, 'pip', 'compile', '--python-platform', 'x86_64-unknown-linux-gnu']
+    # PyPI is inconsistent with Torch index and may include nvidia packages for CPU torch
+    # Use the same Torch index instead
+    tv = (
+        Version.parse('0.0.0')
+        if torch_version is None
+        else Version.parse(torch_version)
+    )
+    cmd = cmd + index_args(tv, cuda_version) + [args.requirements]
     env['VIRTUAL_ENV'] = udir
     try:
         proc = subprocess.run(cmd, check=True, env=env, capture_output=True, text=True)