-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathDockerfile.train
More file actions
143 lines (115 loc) · 4.75 KB
/
Dockerfile.train
File metadata and controls
143 lines (115 loc) · 4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Dockerfile.train — GPU training container for Crypto Vision
#
# Multi-stage build producing a CUDA-enabled image with all Python
# dependencies for fine-tuning open-source LLMs using LoRA + Unsloth.
#
# Build:
#   docker build -f Dockerfile.train -t crypto-vision-trainer .
#
# Run (requires NVIDIA GPU + nvidia-container-toolkit):
#   docker run --gpus all \
#     -v ./data/training:/training/data \
#     -v ./models:/training/models \
#     -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN \
#     -e GCS_BUCKET=crypto-vision-models \
#     crypto-vision-trainer \
#     scripts/opensource/train.py \
#       --model meta-llama/Llama-3.1-8B-Instruct \
#       --data data/prepared/train.jsonl \
#       --output models/crypto-vision-llama-8b \
#       --epochs 3 --quantize
#
# Copyright 2024-2026 nirholas. All rights reserved.
# ─── Stage 1: Build dependencies ─────────────────────────────
# Build stage on the CUDA 12.4 "devel" image. It is named "builder" so the
# runtime stage can copy artifacts out of it by name.
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 AS builder

# Environment for the build steps in this stage: non-interactive apt,
# no pip download cache, and no pip self-version check.
ENV DEBIAN_FRONTEND=noninteractive \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1
# System packages needed to build and install the Python dependencies.
# One package per line in alphabetical order; the apt package lists are
# removed in the same layer so they do not persist in it.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        python3-pip \
        python3.11 \
        python3.11-dev \
        python3.11-venv \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Set Python 3.11 as default: register it as the provider of both
# /usr/bin/python3 and /usr/bin/python.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1

# Install all Python dependencies into a virtual environment.
# Putting the venv's bin directory first on PATH means every later
# `python3` / `pip` call in this stage uses the venv.
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:${PATH}"

# Upgrade the packaging tools *inside* the venv. This used to run before the
# venv existed, which upgraded the system interpreter's pip instead and left
# the venv (the only thing copied into the runtime image) on its bundled pip.
RUN python3 -m pip install --upgrade pip setuptools wheel
# PyTorch 2.4.0, resolved from the PyTorch wheel index for CUDA 12.4.
RUN pip install \
        --index-url https://download.pytorch.org/whl/cu124 \
        torch==2.4.0

# Hugging Face ecosystem plus supporting training libraries
# (alphabetical; scipy, sentencepiece and protobuf are left unpinned).
RUN pip install \
        accelerate==0.33.0 \
        bitsandbytes==0.43.3 \
        datasets==2.21.0 \
        peft==0.12.0 \
        protobuf \
        scipy \
        sentencepiece \
        transformers==4.44.0 \
        trl==0.9.6

# Quantization libraries.
RUN pip install \
        autoawq==0.2.6 \
        auto-gptq==0.7.1

# Google Cloud Storage client, used for model upload.
RUN pip install google-cloud-storage
# Install Unsloth for 2x faster training.
# The cu124-torch240 extra selects the variant for CUDA 12.4 + PyTorch 2.4.0.
# NOTE(review): this installs from the repository's default branch, so the
# result changes over time — pin a tag or commit once one has been validated.
RUN pip install "unsloth[cu124-torch240] @ git+https://github.com/unslothai/unsloth.git"

# Install vLLM for post-training inference smoke tests.
# `torch==2.4.0` is repeated here as a guard: with only the open-ended
# `vllm>=0.5.0`, pip is free to pick the newest vLLM and replace the CUDA 12.4
# torch build installed earlier with whatever torch that release requires.
# Listing torch in the same command makes the resolver choose a vLLM release
# compatible with torch 2.4.0 (the installed 2.4.0+cu124 build satisfies the
# specifier), or fail loudly instead of silently swapping torch.
RUN pip install \
        "vllm>=0.5.0" \
        "torch==2.4.0"

# Install benchmarking dependencies.
RUN pip install aiohttp
# ─── Stage 2: Runtime image ──────────────────────────────────
# Final image, based on the CUDA 12.4 "runtime" variant.
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 AS runtime

# Build-time only: keeps apt non-interactive for the RUN steps of this stage
# without leaving DEBIAN_FRONTEND set in the environment of running containers
# (an ARG is visible to RUN instructions but is not persisted like ENV).
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment: unbuffered Python output (so logs appear immediately)
# and no pip download cache.
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1
# Minimal OS packages for the runtime image: the Python 3.11 interpreter,
# its venv support, and curl. The apt lists are dropped in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        python3.11 \
        python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Make Python 3.11 the provider of both `python3` and `python`,
# mirroring what the builder stage does.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
# Bring the fully populated virtual environment over from the "builder"
# stage, then put its bin directory first on PATH so its interpreter and
# console scripts are the ones found by name.
COPY --from=builder /opt/venv/ /opt/venv/
ENV PATH=/opt/venv/bin:$PATH
# Install the Google Cloud SDK (provides gsutil, needed for model upload).
#
# The installer is downloaded to a file and then executed, rather than piped
# straight into bash: RUN uses /bin/sh without pipefail, so in `curl | bash`
# a failed download leaves bash reading empty input and exiting 0, and the
# build would "succeed" with no SDK installed. `-f` additionally makes curl
# fail on HTTP error responses instead of saving the error page.
RUN curl -fsSL -o /tmp/install-google-cloud-sdk.sh https://sdk.cloud.google.com \
    && bash /tmp/install-google-cloud-sdk.sh --disable-prompts \
    && rm -f /tmp/install-google-cloud-sdk.sh \
    && echo 'source /root/google-cloud-sdk/path.bash.inc' >> /etc/bash.bashrc

# Expose the SDK binaries to non-interactive processes as well; the
# bash.bashrc line above only takes effect in interactive bash shells.
ENV PATH="/root/google-cloud-sdk/bin:${PATH}"
# Provision a system group and a matching system user named "trainer" with a
# home directory. No USER instruction selects this account, so containers
# still start as root unless a user is chosen at run time (root is kept
# available for GPU access if needed).
RUN groupadd --system trainer \
    && useradd --system --gid trainer --create-home trainer
# Set working directory; relative paths below and at run time resolve here.
WORKDIR /training

# Copy training scripts: the contents of scripts/training/ in the build
# context land in /training/scripts/.
COPY scripts/training/ ./scripts/

# Copy training data if it is present in the build context.
# In production, data is usually mounted as a volume at /training/data.
#
# COPY cannot express "optional": it is not run by a shell, so the previous
# `COPY ... 2>/dev/null || true` had its extra tokens parsed as additional
# paths and broke the build. Instead, the build context is bind-mounted
# (read-only, for this step only — requires BuildKit) and the directory is
# copied with cp only when it exists. /training/data is always created so
# the path is valid either way.
RUN --mount=type=bind,target=/tmp/build-context \
    mkdir -p /training/data \
    && if [ -d /tmp/build-context/data/training ]; then \
        cp -a /tmp/build-context/data/training/. /training/data/; \
    fi

# Ensure the model output directories exist and are writable by "trainer".
# /models is kept as before; /training/models is the location the documented
# `docker run` example mounts and writes to (relative to the working directory).
RUN mkdir -p /models /training/models \
    && chown trainer:trainer /models /training/models
# Healthcheck: import torch and assert that CUDA is available.
# After a 30s start period the probe runs every 60s with a 10s timeout;
# three consecutive failures mark the container unhealthy. Any non-zero
# exit from the probe is normalised to exit code 1.
HEALTHCHECK --start-period=30s --interval=60s --timeout=10s --retries=3 \
    CMD python3 -c "import torch; assert torch.cuda.is_available(), 'No GPU'" || exit 1
# OCI image metadata (title, description, source repository, authors).
LABEL org.opencontainers.image.authors="nirholas" \
      org.opencontainers.image.description="GPU training container for fine-tuning crypto-specific LLMs" \
      org.opencontainers.image.source="https://github.com/nirholas/crypto-vision" \
      org.opencontainers.image.title="crypto-vision-trainer"
# Default entrypoint: the `python3` found first on PATH. Exec (JSON) form is
# used, so arguments passed to `docker run` after the image name replace CMD
# and are handed to the interpreter directly.
ENTRYPOINT ["python3"]
# Default command: show the training script's help. The script path is
# relative to the working directory.
# NOTE(review): scripts are copied from scripts/training/ into ./scripts/
# earlier in this file, so scripts/opensource/train.py only exists in the
# image if the repository has scripts/training/opensource/train.py — confirm
# the path (or the COPY source) against the repository layout.
CMD ["scripts/opensource/train.py", "--help"]