Skip to content

Commit b5d1e8a

Browse files
committed
Add deployment for nvidia-mig-manager.service through systemd
The current implementation has been built on a DGX-A100 system (running the latest DGX OS -- a variant of Ubuntu 20.04). It is therefore customized to the set of services and components expected to be running on this OS. The file 'utils-custom.sh' can be used to customize this for other deployments without changing the core logic of the service and its constituent components. Signed-off-by: Kevin Klues <[email protected]>
1 parent 040d021 commit b5d1e8a

9 files changed

+605
-0
lines changed

deployments/systemd/apply-config.sh

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env bash
#
# Apply a named MIG configuration and persist the selection across reboots.
#
# Usage: apply-config.sh <selected-config>
#
# Environment overrides (each is only defaulted when unset or empty):
#   config_file      Config file handed to the apply helpers
#                    (default: config.yaml next to this script).
#   selected_config  Name of the MIG config to apply
#                    (default: the first positional argument).
#
# Exits 1 on a usage error or as soon as one of the steps fails; otherwise
# the exit status is that of the final 'nvidia-mig-parted export'.

# Resolve symlinks first (the same way service.sh does) so that utils.sh and
# config.yaml are found even when this script is invoked through a link.
CURRDIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" >/dev/null 2>&1 && pwd)"

# Quoted so that an install path containing whitespace does not get split.
# The nvidia-mig-manager::service::* helpers used below are not defined in
# this script; presumably they come from this file.
source "${CURRDIR}/utils.sh"

if [[ "$#" -ne 1 ]]; then
  (set +x; echo "Requires exactly one argument with the name of the desired MIG config")
  exit 1
fi

: "${config_file:=${CURRDIR}/config.yaml}"
: "${selected_config:=${1}}"

# Trace every command from here on; the error messages below are printed
# from a subshell with tracing switched off so they appear un-prefixed.
set -x

if ! nvidia-mig-manager::service::persist_config_across_reboot "${selected_config}"; then
  (set +x; echo "Error persisting config across reboots")
  exit 1
fi

if ! nvidia-mig-manager::service::apply_mode "${config_file}" "${selected_config}"; then
  (set +x; echo "Error applying MIG mode")
  exit 1
fi

if ! nvidia-mig-manager::service::apply_config "${config_file}" "${selected_config}"; then
  (set +x; echo "Error applying MIG config")
  exit 1
fi

nvidia-mig-parted export

deployments/systemd/config.yaml

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
version: v1
2+
mig-configs:
3+
all-disabled:
4+
- devices: all
5+
mig-enabled: false
6+
7+
all-1g.5gb:
8+
- devices: all
9+
mig-enabled: true
10+
mig-devices:
11+
"1g.5gb": 7
12+
13+
all-2g.10gb:
14+
- devices: all
15+
mig-enabled: true
16+
mig-devices:
17+
"2g.10gb": 3
18+
19+
all-3g.20gb:
20+
- devices: all
21+
mig-enabled: true
22+
mig-devices:
23+
"3g.20gb": 2
24+
25+
all-balanced:
26+
- devices: all
27+
mig-enabled: true
28+
mig-devices:
29+
"1g.5gb": 2
30+
"2g.10gb": 1
31+
"3g.20gb": 1

deployments/systemd/install.sh

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env bash
#
# Install the nvidia-mig-manager systemd service.
#
# Steps:
#   1. Create the binary, unit, data, config and drop-in directories.
#   2. Fetch and install the nvidia-mig-parted binary with the Go toolchain.
#   3. Copy the unit file, its drop-in override, the service scripts and the
#      default config.yaml into place.
#   4. Reload systemd and enable the service.
#
# The files to install are looked up next to this script, so it can be run
# from any working directory.

# Abort on the first failing step: without this a failed 'go get' or 'cp'
# would still end with the service being enabled.
set -euo pipefail

# Directory holding this script (symlinks resolved) and the files to install.
CURRDIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" >/dev/null 2>&1 && pwd)"

SERVICE_ROOT="nvidia-mig-manager"
SERVICE_NAME="${SERVICE_ROOT}.service"

MIG_PARTED_NAME="nvidia-mig-parted"
MIG_PARTED_GO_GET_PATH="github.com/NVIDIA/mig-parted/cmd/${MIG_PARTED_NAME}"

BINARY_DIR="/usr/bin/"
SYSTEMD_DIR="/usr/lib/systemd/system"
DATA_DIR="/var/lib/${SERVICE_ROOT}"
CONFIG_DIR="/etc/${SERVICE_ROOT}"
OVERRIDE_DIR="/etc/systemd/system/${SERVICE_NAME}.d"

mkdir -p -- \
  "${BINARY_DIR}" \
  "${SYSTEMD_DIR}" \
  "${DATA_DIR}" \
  "${CONFIG_DIR}" \
  "${OVERRIDE_DIR}"

# Fetch the sources in GOPATH mode, then install the binary into BINARY_DIR.
GO111MODULE=off go get -u "${MIG_PARTED_GO_GET_PATH}"
GOBIN="${BINARY_DIR}" go install "${MIG_PARTED_GO_GET_PATH}"

cp -- "${CURRDIR}/${SERVICE_NAME}" "${SYSTEMD_DIR}"
cp -- "${CURRDIR}/override.conf" "${OVERRIDE_DIR}"
cp -- "${CURRDIR}/service.sh" "${CONFIG_DIR}"
cp -- "${CURRDIR}/utils.sh" "${CONFIG_DIR}"
cp -- "${CURRDIR}/utils-custom.sh" "${CONFIG_DIR}"
cp -- "${CURRDIR}/apply-config.sh" "${CONFIG_DIR}"
cp -- "${CURRDIR}/config.yaml" "${CONFIG_DIR}"

# Make systemd pick up the new unit and drop-in before enabling the service.
systemctl daemon-reload
systemctl enable "${SERVICE_NAME}"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[Unit]
2+
Description=Configure MIG on NVIDIA GPUs
3+
DefaultDependencies=no
4+
After=sysinit.target local-fs.target
5+
Before=basic.target nvidia-persistenced.service
6+
7+
[Service]
8+
Type=oneshot
9+
ExecStart=-/bin/bash /etc/nvidia-mig-manager/service.sh
10+
11+
[Install]
12+
WantedBy=basic.target

deployments/systemd/override.conf

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[Service]
2+
Environment="MIG_PARTED_SELECTED_CONFIG=all-disabled"

deployments/systemd/service.sh

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/env bash
#
# Bring the GPUs into the selected MIG mode and then apply the selected MIG
# device configuration.
#
# Environment:
#   MIG_PARTED_CONFIG_FILE      Config file to use
#                               (default: config.yaml next to this script).
#   MIG_PARTED_SELECTED_CONFIG  Name of the config to apply. Required; the
#                               script aborts with a message when it is
#                               unset or empty.
# Both variables are exported; the nvidia-mig-parted invocations below pass
# no config arguments, so presumably the tool reads them from the
# environment.
#
# Exit status: 1 when applying the MIG mode, the GPU reset or the MIG config
# fails; the status of the reboot helper when a reboot is attempted; 0 when
# the nvidia module is not loaded; otherwise the status of the final
# 'nvidia-mig-parted export'.

# Directory holding this script, with symlinks resolved. Every expansion is
# quoted so that a path containing whitespace is not split.
CURRDIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" >/dev/null 2>&1 && pwd)"

# The nvidia-mig-manager::service::* helpers used below are not defined in
# this script; presumably they come from this file.
source "${CURRDIR}/utils.sh"

: "${MIG_PARTED_CONFIG_FILE:=${CURRDIR}/config.yaml}"
: "${MIG_PARTED_SELECTED_CONFIG:?Environment variable must be set before calling this script}"

export MIG_PARTED_CONFIG_FILE
export MIG_PARTED_SELECTED_CONFIG

# Trace every command from here on; messages are printed from a subshell
# with tracing switched off so they appear un-prefixed.
set -x

# Only go through the process of applying the MIG mode
# if the desired mode is not already applied.
if ! nvidia-mig-parted assert --mode-only; then
  # Apply MIG mode, without issuing a GPU reset
  if ! nvidia-mig-parted apply --mode-only --skip-reset; then
    (set +x; echo "Error applying MIG mode")
    exit 1
  fi

  # If GPU reset is not available (e.g. GPU passthrough virtualization),
  # then issue a reboot. The reboot will only occur once. If the MIG mode is
  # still not applied after reboot, this script will error out.
  if ! nvidia-mig-manager::service::assert_gpu_reset_available; then
    (set +x;
    echo "GPU reset capabilities are not available"
    echo "Attempting reboot")
    nvidia-mig-manager::service::reboot
    exit "${?}"
  fi

  # Since the desired MIG mode is already applied, the
  # following will just do a GPU reset under the hood
  if ! nvidia-mig-parted apply --mode-only; then
    (set +x; echo "Error issuing GPU reset")
    exit 1
  fi
fi

# In case a reboot was issued by a previous iteration of this script, we clear
# the reboot state so that the next MIG mode change + reboot will succeed.
nvidia-mig-manager::service::clear_reboot_state

# Without the nvidia module there is nothing to configure;
# this is not treated as an error.
if ! nvidia-mig-manager::service::assert_module_loaded "nvidia"; then
  (set +x; echo "No nvidia module loaded, skipping MIG device config")
  exit 0
fi

if ! nvidia-mig-parted apply; then
  (set +x; echo "Error applying MIG config")
  exit 1
fi

nvidia-mig-parted export

deployments/systemd/uninstall.sh

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env bash
#
# Remove the nvidia-mig-manager systemd service, its data, config and
# drop-in directories, and the nvidia-mig-parted binary.
#
# Cleanup is best-effort: a failing step does not stop the remaining ones,
# and files that are already gone are not treated as errors, so the script
# can be re-run after a partial install or uninstall.

SERVICE_ROOT="nvidia-mig-manager"
SERVICE_NAME="${SERVICE_ROOT}.service"

MIG_PARTED_NAME="nvidia-mig-parted"

BINARY_DIR="/usr/bin/"
SYSTEMD_DIR="/usr/lib/systemd/system"
DATA_DIR="/var/lib/${SERVICE_ROOT}"
CONFIG_DIR="/etc/${SERVICE_ROOT}"
OVERRIDE_DIR="/etc/systemd/system/${SERVICE_NAME}.d"

systemctl disable "${SERVICE_NAME}"

# ':?' aborts instead of running 'rm -rf' on an empty path.
rm -rf -- "${DATA_DIR:?}"
rm -rf -- "${CONFIG_DIR:?}"
rm -rf -- "${OVERRIDE_DIR:?}"

# '%/' drops the trailing slash of BINARY_DIR to avoid a '//' in the path.
rm -f -- "${BINARY_DIR%/}/${MIG_PARTED_NAME}"
rm -f -- "${SYSTEMD_DIR}/${SERVICE_NAME}"

# Reload only after the unit file and its drop-in are gone; reloading
# earlier would leave the removed unit loaded in systemd.
systemctl daemon-reload

deployments/systemd/utils-custom.sh

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#!/usr/bin/env bash
2+
3+
# Steps run ahead of a MIG mode change (presumably called by the core
# service logic; the caller is not part of this file): stop the k8s
# components, stop the driver services, then remove the driver modules.
# Returns 1 as soon as one step fails, 0 when all three succeed.
function nvidia-mig-manager::service::pre_apply_mode() {
  nvidia-mig-manager::service::stop_k8s_components || return 1
  nvidia-mig-manager::service::stop_driver_services || return 1
  nvidia-mig-manager::service::remove_driver_modules || return 1
  return 0
}
18+
19+
# Steps run after a MIG mode change (presumably called by the core service
# logic; the caller is not part of this file): insert the driver modules,
# start the driver services, then start the k8s components.
# Returns 1 as soon as one step fails, 0 when all three succeed.
function nvidia-mig-manager::service::post_apply_mode() {
  nvidia-mig-manager::service::insert_driver_modules || return 1
  nvidia-mig-manager::service::start_driver_services || return 1
  nvidia-mig-manager::service::start_k8s_components || return 1
  return 0
}
34+
35+
# Step run ahead of applying a MIG config: stop the k8s components.
# Returns the exit status of stop_k8s_components unchanged.
function nvidia-mig-manager::service::pre_apply_config() {
  # Last command of the function, so its status is the function's status.
  nvidia-mig-manager::service::stop_k8s_components
}
39+
40+
# Step run after applying a MIG config: start the k8s components again.
# Returns the exit status of start_k8s_components unchanged.
function nvidia-mig-manager::service::post_apply_config() {
  # Last command of the function, so its status is the function's status.
  nvidia-mig-manager::service::start_k8s_components
}
44+
45+
# Remove the NVIDIA driver kernel modules listed below.
# Returns the exit status of the remove_modules helper unchanged.
function nvidia-mig-manager::service::remove_driver_modules() {
  # Listed in the reverse of the order used by insert_driver_modules.
  local -a modules=(nvidia_uvm nvidia_drm nvidia_modeset nvidia)

  # The helper is given the NAME of the array, not its elements; it is
  # defined outside this file and presumably looks the array up by name.
  nvidia-mig-manager::service::remove_modules modules
}
55+
56+
# Insert the NVIDIA driver kernel modules listed below.
# Returns the exit status of the insert_modules helper unchanged.
function nvidia-mig-manager::service::insert_driver_modules() {
  # Listed in the reverse of the order used by remove_driver_modules.
  local -a modules=(nvidia nvidia_modeset nvidia_drm nvidia_uvm)

  # The helper is given the NAME of the array, not its elements; it is
  # defined outside this file and presumably looks the array up by name.
  nvidia-mig-manager::service::insert_modules modules
}
66+
67+
# Stop the systemd services listed below via the stop_systemd_services
# helper (defined outside this file).
# Returns the exit status of that helper unchanged.
function nvidia-mig-manager::service::stop_driver_services() {
  # Exact reverse of the list in start_driver_services.
  local -a services=(
    dcgm.service
    nv_peer_mem.service
    nvsm-notifier.service
    nvsm-api-gateway.service
    nvsm-core.service
    nvsm-mqtt.service
    nvsm.service
    nvidia-fabricmanager.service
    nvidia-persistenced.service
  )

  # Pass the array by NAME; the helper presumably resolves it itself.
  nvidia-mig-manager::service::stop_systemd_services services
}
82+
83+
# Start the systemd services listed below via the start_systemd_services
# helper (defined outside this file).
# Returns the exit status of that helper unchanged.
function nvidia-mig-manager::service::start_driver_services() {
  # Exact reverse of the list in stop_driver_services.
  local -a services=(
    nvidia-persistenced.service
    nvidia-fabricmanager.service
    nvsm.service
    nvsm-mqtt.service
    nvsm-core.service
    nvsm-api-gateway.service
    nvsm-notifier.service
    nv_peer_mem.service
    dcgm.service
  )

  # Pass the array by NAME; the helper presumably resolves it itself.
  nvidia-mig-manager::service::start_systemd_services services
}
98+
99+
# Stop the Kubernetes-related pieces: first the systemd services listed
# below, then the containers selected by the image names listed below.
# Both helpers are defined outside this file and receive the NAME of the
# array rather than its elements.
# Returns 1 as soon as one of the two steps fails, 0 otherwise.
function nvidia-mig-manager::service::stop_k8s_components() {
  local -a services=(kubelet.service dcgm-exporter.service)
  nvidia-mig-manager::service::stop_systemd_services services || return 1

  local -a container_images=(k8s-device-plugin gpu-feature-discovery)
  nvidia-mig-manager::service::kill_k8s_containers_via_runtime_by_image container_images || return 1

  return 0
}
118+
119+
# Start the Kubernetes-related systemd services listed below via the
# start_systemd_services helper (defined outside this file).
# Returns the exit status of that helper unchanged.
function nvidia-mig-manager::service::start_k8s_components() {
  # Reverse of the service list in stop_k8s_components.
  local -a services=(dcgm-exporter.service kubelet.service)

  # Pass the array by NAME; the helper presumably resolves it itself.
  nvidia-mig-manager::service::start_systemd_services services
}

0 commit comments

Comments
 (0)