forked from NVIDIA/gpu-driver-container
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocp_dtk_entrypoint
executable file
·294 lines (233 loc) · 10.1 KB
/
ocp_dtk_entrypoint
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Entrypoint offering two commands (see usage): one run by the
# nvidia-driver-ctr container, one run by the openshift-driver-toolkit-ctr
# container. The two sides coordinate through flag files created in
# DRIVER_TOOLKIT_SHARED_DIR.

# Abort on unhandled command failures and on use of unset variables.
set -eu

# Directory used by both commands to exchange scripts, flag files and the
# generated kernel modules.
DRIVER_TOOLKIT_SHARED_DIR=/mnt/shared-nvidia-driver-toolkit

echo "Running $*"

# Resolve the directory holding this script so that common.sh is found
# regardless of the current working directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Quoted so that a script location containing whitespace or glob characters
# is not word-split or expanded.
source "${SCRIPT_DIR}/common.sh"
# Entrypoint of the nvidia-driver-ctr container when the kernel modules are
# built by the openshift-driver-toolkit-ctr container.
#
# Steps:
#   1. If RHCOS_IMAGE_MISSING is "true", fall back to `nvidia-driver init`.
#   2. Unless the dir_prepared flag exists, copy the scripts, drivers and
#      licenses into the shared directory, dump the environment to
#      "$DRIVER_TOOLKIT_SHARED_DIR/env" and create the flag.
#   3. Wait for the driver_build_started flag (falling back to
#      `nvidia-driver init` if driver_toolkit_broken shows up), then wait for
#      the driver_built flag.
#   4. Copy the modules from the shared directory into
#      /lib/modules/$(uname -r), relabel them for SELinux and hand over to
#      `nvidia-driver load`.
#
# Globals:
#   DRIVER_TOOLKIT_SHARED_DIR (read)
#   RHCOS_IMAGE_MISSING, RHCOS_VERSION (read, optional)
#   MODULES_SHARED, MODULES_LOCAL (written)
# Returns:
#   Does not return on success: the shell is replaced through `exec`.
nv-ctr-run-with-dtk() {
    set -x

    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
        echo "WARNING: RHCOS '${RHCOS_VERSION:-}' imagetag missing, using entitlement-based fallback"
        exec bash -x nvidia-driver init
    fi

    if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
        cp -r \
           /tmp/install.sh \
           /usr/local/bin/ocp_dtk_entrypoint \
           /usr/local/bin/nvidia-driver \
           /usr/local/bin/common.sh \
           /usr/local/bin/extract-vmlinux \
           /usr/local/bin/vgpu-util \
           /drivers \
           /licenses \
           "$DRIVER_TOOLKIT_SHARED_DIR/"

        # Turn every NAME=value line into NAME="value" so that the file can be
        # read back with `source` by dtk-build-driver.
        # NOTE(review): values are not escaped; a value containing a double
        # quote, '$', a backtick or a newline will not round-trip correctly.
        env | sed 's/=/="/' | sed 's/$/"/' > "$DRIVER_TOOLKIT_SHARED_DIR/env"

        touch "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared"
    fi

    # Tracing is disabled while polling to keep the log readable.
    set +x
    while [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started" ]]; do
        if [[ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken" ]]; then
            echo "WARNING: broken driver toolkit detected, using entitlement-based fallback"
            exec bash -x nvidia-driver init
        fi

        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to start ..."
        sleep 15
    done

    echo "$(date) openshift-driver-toolkit-ctr started."

    while [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]]; do
        echo "$(date) Waiting for openshift-driver-toolkit-ctr container to build the precompiled driver ..."
        sleep 15
    done
    set -x

    MODULES_SHARED=${DRIVER_TOOLKIT_SHARED_DIR}/modules/

    # Copy the modules to their standard location
    MODULES_LOCAL="/lib/modules/$(uname -r)"
    mkdir -p "${MODULES_LOCAL}"

    cp -rv "${MODULES_SHARED}"/* "${MODULES_LOCAL}"

    # Tell SELinux to allow loading these files. The search is rooted at the
    # directory the modules were just copied to and matches kernel objects
    # (plain or compressed). `-exec ... \;` is kept on purpose: a failing
    # chcon only makes the predicate false and does not make find (and thus
    # the script under `set -e`) fail.
    find "${MODULES_LOCAL}" -type f \
         \( -name "*.ko" -or -name "*.ko.*" \) \
         -exec chcon -t modules_object_t "{}" \;

    echo "#"
    echo "# Executing nvidia-driver load script ..."
    echo "#"

    exec bash -x nvidia-driver load
}
# Entrypoint of the openshift-driver-toolkit-ctr container: build the NVIDIA
# kernel modules and publish them in the shared directory.
#
# Steps:
#   1. Sleep forever if RHCOS_IMAGE_MISSING is "true", or if
#      /lib/modules/$(uname -r)/vmlinuz is missing (in which case the
#      driver_toolkit_broken flag is created first).
#   2. Create the driver_build_started flag; if driver_built already exists,
#      wait until it disappears before rebuilding.
#   3. Load the environment dumped in "$DRIVER_TOOLKIT_SHARED_DIR/env", patch
#      the shared copy of nvidia-driver and run `nvidia-driver build`.
#   4. Optionally build nvidia-fs and gdrcopy.
#   5. Copy the required modules and the modules.* files to
#      "$DRIVER_TOOLKIT_SHARED_DIR/modules", create the *_built flags and
#      sleep until driver_built disappears.
#
# Globals:
#   DRIVER_TOOLKIT_SHARED_DIR, KERNEL_VERSION (read)
#   RHCOS_IMAGE_MISSING, RHCOS_VERSION (read, optional)
#   TARGETARCH, DRIVER_VERSION (read; expected to come from the env file)
#   DRIVER_ARCH, MODULES_SHARED, PATH, NVIDIA_SRC_DIR (written)
# Returns:
#   Exits 0 once the driver_built flag disappears, exits 1 when no NVIDIA
#   module was generated, returns 1 when the driver sources needed by gdrcopy
#   cannot be found.
dtk-build-driver() {
    if [[ "${RHCOS_IMAGE_MISSING:-}" == "true" ]]; then
        # RHCOS_VERSION is defaulted so that `set -u` cannot abort the script
        # before the warning is displayed.
        echo "WARNING: 'istag/driver-toolkit:${RHCOS_VERSION:-} -n openshift' missing, nothing to do in openshift-driver-toolkit-ctr container"
        sleep inf
    fi

    if ! [[ -f "/lib/modules/$(uname -r)/vmlinuz" ]]; then
        echo "WARNING: broken Driver Toolkit image detected:"
        echo "- Node kernel: $(uname -r)"
        echo "- Kernel package: $(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core)"

        echo "INFO: informing nvidia-driver-ctr to fallback on entitled-build."
        touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_toolkit_broken"
        echo "INFO: nothing else to do in openshift-driver-toolkit-ctr container, sleeping forever."
        sleep inf
    fi

    # Shared directory is prepared before entering this script. See
    # 'until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ] ...'
    # in the Pod command/args

    touch "$DRIVER_TOOLKIT_SHARED_DIR/driver_build_started"

    if [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; then
        echo "NVIDIA drivers already generated, nothing to do ..."

        while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
            sleep 30
        done
        echo "WARNING: driver_built flag disappeared, rebuilding the drivers ..."
    else
        echo "Start building nvidia.ko driver ..."
    fi

    set -x

    # Import (and export to child processes) the environment dumped by
    # nv-ctr-run-with-dtk.
    set -o allexport
    source "${DRIVER_TOOLKIT_SHARED_DIR}/env"
    set +o allexport

    # Map the container architecture name to the kernel architecture name.
    DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
    echo "DRIVER_ARCH is $DRIVER_ARCH"

    # If this directory already exists,
    # NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run fails to run
    # and doesn't create its files. This may happen when the
    # container fails and restarts its execution, leading to
    # hard-to-understand "unrelated" errors later in the script execution
    rm -rf "${DRIVER_TOOLKIT_SHARED_DIR}/drivers/NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"

    # elfutils-libelf-devel.x86_64 is already installed in the DTK and enough
    sed 's/elfutils-libelf.x86_64//' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"

    # Install script assumes these directories can be deleted->recreated,
    # but recreation doesn't happen in the DTK.
    # The sed expressions are single-quoted on purpose: they must match the
    # literal text '${KERNEL_VERSION}' inside the nvidia-driver script.
    sed 's|rm -rf /lib/modules/${KERNEL_VERSION}/video||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"
    sed 's|rm -rf /lib/modules/${KERNEL_VERSION}||' -i "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver"

    mkdir "${DRIVER_TOOLKIT_SHARED_DIR}/bin" -p

    cp -v \
       "$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \
       "$DRIVER_TOOLKIT_SHARED_DIR/common.sh" \
       "$DRIVER_TOOLKIT_SHARED_DIR/extract-vmlinux" \
       "$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" \
       "${DRIVER_TOOLKIT_SHARED_DIR}/bin"

    # Shadow dnf with a no-op placed first in PATH. `which` is used rather
    # than `command -v` because the latter would report the `true` shell
    # builtin instead of a file path usable as a symlink target.
    ln -s "$(which true)" "${DRIVER_TOOLKIT_SHARED_DIR}/bin/dnf" --force

    export PATH="${DRIVER_TOOLKIT_SHARED_DIR}/bin:$PATH"

    # Install.sh script is mandatory
    cp "${DRIVER_TOOLKIT_SHARED_DIR}/install.sh" /tmp/

    cd "${DRIVER_TOOLKIT_SHARED_DIR}/drivers"
    echo "#"
    echo "# Executing nvidia-driver build script ..."
    echo "#"
    bash -x "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia-driver" build --tag builtin

    echo "#"
    echo "# nvidia-driver build script completed."
    echo "#"

    local kernel_release
    kernel_release=$(uname -r)

    # List the generated modules. The listing is the `if` condition itself so
    # that, under `set -e`, a missing module reaches the FATAL message instead
    # of aborting silently on a failed command substitution.
    if ! ls "/lib/modules/${kernel_release}/kernel/drivers/video/"nvidia*.ko 2>/dev/null; then
        echo "FATAL: no NVIDIA driver generated ..."
        exit 1
    fi

    if _gpu_direct_storage_enabled; then
        echo "#"
        echo "# Executing nvidia-fs driver build."
        echo "#"

        # The dkms package is not supplied or supported by Red Hat.
        # DKMS packages for RHEL are available in the third-party EPEL (Extra Packages for Enterprise Linux) repository.
        # see https://access.redhat.com/solutions/1132653
        # NOTE(review): PATH was prefixed above with a directory where dnf is
        # a symlink to `true`, so these dnf calls look like no-ops unless
        # common.sh overrides dnf - confirm the packages are already present.
        dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
        dnf config-manager --enable epel
        dnf install -y dkms redhat-lsb-core kmod binutils net-tools iputils libudev-devel libnl3-devel udev openssl-devel userspace-rcu libmount
        dnf group install -y "Development Tools"

        # Make nvidia driver sources accessible for building nvidia-fs
        mkdir -p "/lib/modules/${KERNEL_VERSION}/updates"
        ln -s "/run/nvidia/driver/lib/modules/${KERNEL_VERSION}/kernel/drivers/video/" "/lib/modules/${KERNEL_VERSION}/updates/dkms"

        # Install nvidia-fs
        make -C "$DRIVER_TOOLKIT_SHARED_DIR/gds-nvidia-fs/src"

        echo "#"
        echo "# nvidia-fs build script completed."
        echo "#"
    fi

    if _gdrcopy_enabled; then
        echo "#"
        echo "# Executing gdrcopy driver build."
        echo "#"

        # Locate the nvidia driver sources needed for building gdrcopy: the
        # directory holding the first nv-p2p.c found under /usr/src/nvidia-*.
        # Errors from find (e.g. no /usr/src/nvidia-* directory) are treated
        # the same as "not found".
        local nv_p2p_source nvidia_src_dir
        nv_p2p_source=$(find /usr/src/nvidia-* -name "nv-p2p.c" -print -quit 2>/dev/null || true)
        if [[ -z "${nv_p2p_source}" ]]; then
            echo "Failed to find NVIDIA driver source, exiting."
            return 1
        fi
        nvidia_src_dir=$(dirname "${nv_p2p_source}")
        export NVIDIA_SRC_DIR=$nvidia_src_dir

        # Build gdrdrv kernel module
        make -C "$DRIVER_TOOLKIT_SHARED_DIR/gdrcopy" driver

        echo "#"
        echo "# gdrcopy build script completed."
        echo "#"
    fi

    MODULES_SHARED="${DRIVER_TOOLKIT_SHARED_DIR}/modules"
    mkdir -p "${MODULES_SHARED}"

    # Prepare the list of modules required by NVIDIA
    modprobe -a i2c_core ipmi_msghandler ipmi_devintf --show-depends > "${MODULES_SHARED}/insmod_nvidia"
    modprobe -a nvidia nvidia-uvm nvidia-modeset --show-depends >> "${MODULES_SHARED}/insmod_nvidia"

    if _gpu_direct_rdma_enabled; then
        modprobe -a nvidia-peermem --show-depends >> "${MODULES_SHARED}/insmod_nvidia"
    fi

    set +x

    # Copy the modules to the shared directory, preserving their location
    # relative to /lib/modules/<kernel release>/.
    local verb modsrc _rest modrel moddir moddst
    while read -r verb modsrc _rest; do
        # Skip blank lines and modules built into the kernel
        # (eg: "builtin i2c_core").
        if [[ -z "${verb}" || "${verb}" == "builtin" ]]; then
            continue
        fi

        # eg: "insmod /lib/modules/4.18.0-305.10.2.el8_4.x86_64/kernel/drivers/gpu/drm/drm.ko.x"
        modrel=${modsrc/"/lib/modules/${kernel_release}/"/}
        moddir=$(dirname "${modrel}")
        moddst="${MODULES_SHARED}/${moddir}"

        mkdir -p "${moddst}"
        cp -v "${modsrc}" "${moddst}"
    done < "${MODULES_SHARED}/insmod_nvidia"

    # Copies modules location and dependency files
    cp "/lib/modules/${kernel_release}/"modules.* "${MODULES_SHARED}"

    echo "NVIDIA drivers generated, inform nvidia-driver-ctr container about it and sleep forever."
    touch "${DRIVER_TOOLKIT_SHARED_DIR}/driver_built"

    if _gpu_direct_storage_enabled; then
        echo "NVIDIA-FS drivers generated, inform nvidia-fs-driver-ctr container about it and sleep forever."
        touch "${DRIVER_TOOLKIT_SHARED_DIR}/nvidia_fs_built"
    fi

    if _gdrcopy_enabled; then
        echo "gdrcopy driver built, inform nvidia-gdrcopy-ctr container about it and sleep forever."
        touch "${DRIVER_TOOLKIT_SHARED_DIR}/gdrcopy_built"
    fi

    while [ -f "$DRIVER_TOOLKIT_SHARED_DIR/driver_built" ]; do
        sleep 30
    done

    echo "WARNING: driver_built flag disappeared, restart this container"

    exit 0
}
# Print the list of supported commands on stderr and terminate the script
# with exit status 1.
usage() {
    printf '%s\n' \
        "Usage: $0 COMMAND" \
        "Commands:" \
        "dtk-build-driver Build NVIDIA driver inside OCP-DTK" \
        "nv-ctr-run-with-dtk entrypoint for nvidia-driver-ctr container" \
        >&2
    exit 1
}
# Command dispatch: validate the command name, check the shared directory,
# then run the function named after the command.
if [ $# -eq 0 ]; then
    usage
fi

command=$1; shift
case "${command}" in
    dtk-build-driver | nv-ctr-run-with-dtk) ;;
    *) usage ;;
esac

# Neither command accepts arguments: anything that followed the command name
# is discarded.
set --

if ! [ -d "${DRIVER_TOOLKIT_SHARED_DIR:-}" ]; then
    echo "FATAL: DRIVER_TOOLKIT_SHARED_DIR env variable must be populated with a valid directory"
    usage
fi

# Read by dtk-build-driver when building nvidia-fs.
KERNEL_VERSION=$(uname -r)

"${command}"