Better GPU memory monitoring, and works with more GPUs #1

Open · wants to merge 2 commits into base: master
33 changes: 33 additions & 0 deletions check_mk/nvidia_smi/agent_based/nvidia_smi.py
@@ -0,0 +1,33 @@
from .agent_based_api.v1 import *
import pprint


def discover_nvidia(section):
    for sector, used, slots in section:
        yield Service(item=sector)


def check_nvidia(item, section):
    for sector, used, slots in section:
        if sector == item:
            used = int(used)    # convert string to int
            slots = int(slots)  # convert string to int
            if used == slots:
                s = State.CRIT
            elif slots - used <= 10:
                s = State.WARN
            else:
                s = State.OK
            yield Result(
                state = s,
                summary = f"used {used} of {slots}")
            yield Metric('gpumem', used, levels=(90, None), boundaries=(0,100))
            return


register.check_plugin(
    name = "nvidia_gpu",
    service_name = "GPU %s",
    discovery_function = discover_nvidia,
    check_function = check_nvidia,
)
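The state logic in check_nvidia() is easiest to see with concrete numbers. Below is a minimal, standalone sketch (not part of this PR) of the same thresholds, run against made-up section rows in the assumed "<name> <used> <total>" shape:

    # Hypothetical rows; in production Checkmk builds the section from the agent output.
    section = [
        ["GPU-0", "80", "100"],   # 20 free             -> OK
        ["GPU-1", "95", "100"],   # slots - used <= 10  -> WARN
        ["GPU-2", "100", "100"],  # used == slots       -> CRIT
    ]

    for sector, used, slots in section:
        used, slots = int(used), int(slots)
        if used == slots:
            state = "CRIT"
        elif slots - used <= 10:
            state = "WARN"
        else:
            state = "OK"
        print(f"{sector}: used {used} of {slots} -> {state}")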
137 changes: 101 additions & 36 deletions check_mk/nvidia_smi/agents/plugins/nvidia_smi
100644 → 100755
@@ -9,17 +9,59 @@ nvidiaSMI = xml.dom.minidom.parseString(Popen(["nvidia-smi", "-q", "-x"], stdout

for gpu in nvidiaSMI.getElementsByTagName('gpu'):
id_gpu = gpu.getAttribute('id')
gpu_utilization = int(gpu
.getElementsByTagName("utilization")[0]
.getElementsByTagName("gpu_util")[0]
.childNodes[0]
.data.split()[0])

gpu_mem_usage = int(gpu
.getElementsByTagName("utilization")[0]
.getElementsByTagName("memory_util")[0]
.childNodes[0]
.data.split()[0])
# Make sure assumptions about units match reality
unit = (gpu
.getElementsByTagName("fb_memory_usage")[0]
.getElementsByTagName("used")[0]
.childNodes[0].data.split()[1])
if unit != 'MiB':
raise AssertionError('Value of fb_memory_usage_used: ' + unit)
unit = (gpu
.getElementsByTagName("fb_memory_usage")[0]
.getElementsByTagName("total")[0]
.childNodes[0].data.split()[1])
if unit != 'MiB':
raise AssertionError('Value of fb_memory_usage_total: ' + unit)

try:
gpu_utilization = int(gpu
.getElementsByTagName("utilization")[0]
.getElementsByTagName("gpu_util")[0]
.childNodes[0]
.data.split()[0])
except ValueError:
gpu_utilization = None


try:
gpu_mem_util = int(gpu
.getElementsByTagName("utilization")[0]
.getElementsByTagName("memory_util")[0]
.childNodes[0]
.data.split()[0])
except ValueError:
gpu_mem_util = None


try:
gpu_fb_memory_usage_used = int(gpu
.getElementsByTagName("fb_memory_usage")[0]
.getElementsByTagName("used")[0]
.childNodes[0]
.data.split()[0]) * 1024*1024
except ValueError:
gpu_fb_memory_usage_used = None

try:
gpu_fb_memory_usage_total = int(gpu
.getElementsByTagName("fb_memory_usage")[0]
.getElementsByTagName("total")[0]
.childNodes[0]
.data.split()[0]) * 1024*1024
except ValueError:
gpu_fb_memory_usage_total = None


try:
gpu_temperature = int(gpu
@@ -28,31 +70,54 @@ for gpu in nvidiaSMI.getElementsByTagName('gpu'):
.childNodes[0]
.data.split()[0])
except ValueError:
gpu_temperature: int = 0

gpu_sm_clock = int(gpu
.getElementsByTagName("clocks")[0]
.getElementsByTagName("sm_clock")[0]
.childNodes[0]
.data.split()[0])

gpu_graphics_clock = int(gpu
.getElementsByTagName("clocks")[0]
.getElementsByTagName("graphics_clock")[0]
.childNodes[0]
.data.split()[0])

gpu_mem_clock = int(gpu
.getElementsByTagName("clocks")[0]
.getElementsByTagName("mem_clock")[0]
.childNodes[0]
.data.split()[0])

print(f"smi nvidia gpu_utilization {gpu_utilization}")
print(f"smi nvidia memory_used {gpu_mem_usage}")
print(f"smi nvidia temperature {gpu_temperature}")
print(f"smi nvidia graphics_clock {gpu_graphics_clock}")
print(f"smi nvidia sm_clock {gpu_sm_clock}")
print(f"smi nvidia msm_clock {gpu_mem_clock}")
gpu_temperature = None


try:
gpu_sm_clock = int(gpu
.getElementsByTagName("clocks")[0]
.getElementsByTagName("sm_clock")[0]
.childNodes[0]
.data.split()[0])
except ValueError:
gpu_sm_clock = None


try:
gpu_graphics_clock = int(gpu
.getElementsByTagName("clocks")[0]
.getElementsByTagName("graphics_clock")[0]
.childNodes[0]
.data.split()[0])
except ValueError:
gpu_graphics_clock: int = 0


try:
gpu_mem_clock = int(gpu
.getElementsByTagName("clocks")[0]
.getElementsByTagName("mem_clock")[0]
.childNodes[0]
.data.split()[0])
except ValueError:
gpu_mem_clock = None


if gpu_utilization is not None or \
gpu_fb_memory_usage_total is not None or \
gpu_fb_memory_usage_used is not None or \
gpu_mem_util is not None or \
gpu_temperature is not None or \
gpu_graphics_clock is not None or \
gpu_sm_clock is not None or \
gpu_mem_clock is not None:
print(f"smi nvidia gpu_utilization {gpu_utilization}")
print(f"smi nvidia memory_util {gpu_mem_util}")
print(f"smi nvidia gpu_fb_memory_usage_total {gpu_fb_memory_usage_total}")
print(f"smi nvidia gpu_fb_memory_usage_used {gpu_fb_memory_usage_used}")
print(f"smi nvidia temperature {gpu_temperature}")
print(f"smi nvidia graphics_clock {gpu_graphics_clock}")
print(f"smi nvidia sm_clock {gpu_sm_clock}")
print(f"smi nvidia msm_clock {gpu_mem_clock}")

i += 1
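The try/except ValueError guards above are there because nvidia-smi reports values such as "N/A" for fields a given GPU or driver does not expose, and int() rejects those; that is presumably what "works with more GPUs" refers to. A standalone sketch (not part of this PR), run against a trimmed, hypothetical `nvidia-smi -q -x` fragment, shows the failure mode, the None fallback, and the MiB-to-bytes conversion:

    import xml.dom.minidom

    # Hypothetical, heavily trimmed XML; real nvidia-smi output contains many more fields.
    sample = """<nvidia_smi_log><gpu id="00000000:01:00.0">
      <utilization><gpu_util>N/A</gpu_util><memory_util>N/A</memory_util></utilization>
      <fb_memory_usage><total>4096 MiB</total><used>1024 MiB</used></fb_memory_usage>
    </gpu></nvidia_smi_log>"""

    gpu = xml.dom.minidom.parseString(sample).getElementsByTagName('gpu')[0]

    try:
        gpu_utilization = int(gpu
            .getElementsByTagName("utilization")[0]
            .getElementsByTagName("gpu_util")[0]
            .childNodes[0].data.split()[0])
    except ValueError:        # "N/A" is not an int -> publish the metric as None
        gpu_utilization = None

    gpu_fb_memory_usage_used = int(gpu
        .getElementsByTagName("fb_memory_usage")[0]
        .getElementsByTagName("used")[0]
        .childNodes[0].data.split()[0]) * 1024 * 1024   # MiB -> bytes

    print(gpu_utilization, gpu_fb_memory_usage_used)    # None 1073741824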
4 changes: 2 additions & 2 deletions check_mk/nvidia_smi/checkman/nvidia_smi
@@ -4,9 +4,9 @@ catalog: app/
license: GPL
distribution: check_mk
description:
Blah Blah
Status of NVIDIA GPU, based on output from nvidia-smi.
inventory:
One service per configured nvidia will be created.

item:
Blah Blah
Status of GPU.
60 changes: 44 additions & 16 deletions check_mk/nvidia_smi/checks/nvidia_smi.py
@@ -31,6 +31,8 @@ def nvidia_smi_parse(info):
if len(line) != 4:
continue # Skip unexpected lines
pool_name, pm_type, metric, value = line
if value == 'None':
continue
item = '%s [%s]' % (pool_name, pm_type)
if item not in data:
data[item] = {}
@@ -58,33 +60,59 @@ def check_nvidia_smi(item, params, info):
data = all_data[item]

perfkeys = [
'gpu_utilization', 'memory_used', 'temperature',
'gpu_utilization', 'memory_util', 'temperature',
'graphics_clock', 'msm_clock', 'sm_clock',
'gpu_fb_memory_usage_used'
]
# Add some more values, derived from the raw ones...
this_time = int(time.time())
# for key in ['accepted_conn', 'max_children_reached', 'slow_requests']:
# per_sec = get_rate("nginx_status.%s" % key, this_time, data[key])
# data['%s_per_sec' % key] = per_sec
# perfkeys.append('%s_per_sec' % key)

perfdata = []
for i, key in enumerate(perfkeys):
perfdata.append( (key, data[key]) )
try:
reading = data[key]
perfdata.append( (key, data[key]) )
except KeyError:
pass
perfdata.sort()

worst_state = 0

proc_warn, proc_crit = params.get('gpu_utilization', (None, None))
proc_txt = ''
if proc_crit is not None and data['gpu_utilization'] > proc_crit:
worst_state = max(worst_state, 2)
proc_txt = ' (!!)'
elif proc_warn is not None and data['gpu_utilization'] > proc_warn:
worst_state = max(worst_state, 1)
proc_txt = ' (!)'

output = [
'GPU util: %d%s memory used: %d, Temperature %d' % (
data['gpu_utilization'], proc_txt, data['memory_used'], data['temperature'],
),
]
try:
proc_warn, proc_crit = params.get('gpu_utilization', (None, None))
proc_txt = ''
if proc_crit is not None and data['gpu_utilization'] > proc_crit:
worst_state = max(worst_state, 2)
proc_txt = ' (!!)'
elif proc_warn is not None and data['gpu_utilization'] > proc_warn:
worst_state = max(worst_state, 1)
proc_txt = ' (!)'
except KeyError:
worst_state = 0
proc_txt = ''

# output = [
# 'Active: %d%s (%d idle, %d waiting)' % (
# data['gpu_utilization'], proc_txt, data['gpu_fb_memory_usage_used'], data['temperature'],
# ),
# 'Started %s ago' % (get_age_human_readable(data['graphics_clock'])),
# 'Requests: %0.2f/s' % (data['msm_clock']),
# ]

if 'gpu_utilization' in data and 'gpu_fb_memory_usage_used' in data and 'temperature' in data:
mem_used_MiB = data['gpu_fb_memory_usage_used'] / (1024*1024)
mem_total_MiB = data['gpu_fb_memory_usage_total'] / (1024*1024)
output = [
'GPU util: %d%%%s, memory used: %d MiB of %d MiB, temperature %dC' % (
data['gpu_utilization'], proc_txt, mem_used_MiB, mem_total_MiB, data['temperature'],
),
]
else:
output = [ 'Not enough data from nvidia-smi, assuming GPU is OK', ]

return worst_state, ', '.join(output), perfdata
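For reference, a standalone, simplified sketch (not part of this PR) of what the 'None'-skipping parse yields from hypothetical agent lines; the real nvidia_smi_parse() may differ in details such as how values are converted:

    info = [
        ["smi", "nvidia", "gpu_utilization", "17"],
        ["smi", "nvidia", "memory_util", "None"],      # GPU did not report it -> skipped
        ["smi", "nvidia", "gpu_fb_memory_usage_used", "1073741824"],
        ["smi", "nvidia", "gpu_fb_memory_usage_total", "4294967296"],
        ["smi", "nvidia", "temperature", "41"],
    ]

    data = {}
    for pool_name, pm_type, metric, value in info:
        if value == 'None':
            continue                          # a missing reading never enters the dict
        item = '%s [%s]' % (pool_name, pm_type)
        data.setdefault(item, {})[metric] = int(value)   # assumption: values stored numerically

    print(data['smi [nvidia]'])
    # {'gpu_utilization': 17, 'gpu_fb_memory_usage_used': 1073741824,
    #  'gpu_fb_memory_usage_total': 4294967296, 'temperature': 41}

Here all three keys needed for the summary line (gpu_utilization, gpu_fb_memory_usage_used, temperature) are present, so the normal output is produced rather than the "not enough data" fallback.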

23 changes: 23 additions & 0 deletions check_mk/nvidia_smi/mkp_packaging/nvidia_smi
@@ -0,0 +1,23 @@
{'author': 'Anton Zhelyazkov',
'description': 'Determine NVIDIA GPU status based on nvidia-smi output',
'download_url': 'https://github.com/antonzhelyazkov/nagiosScripts/tree/master/check_mk/nvidia_smi',
'files': {'agent_based': ['nvidia_smi.py'],
'agents': ['plugins/nvidia_smi'],
'alert_handlers': [],
'bin': [],
'checkman': ['nvidia_smi'],
'checks': ['nvidia_smi'],
'doc': [],
'inventory': [],
'lib': [],
'locales': [],
'mibs': [],
'notifications': [],
'pnp-templates': [],
'web': ['plugins/metrics/nvidia_smi.py']},
'name': 'nvidia_smi',
'title': 'GPU check based on nvidia_smi',
'version': '1.0',
'version.min_required': '2.0.0',
'version.packaged': '2.0.0',
'version.usable_until': None}
23 changes: 18 additions & 5 deletions check_mk/nvidia_smi/web/plugins/metrics/nvidia_smi.py
@@ -46,28 +46,41 @@
})

metric_info["temperature"] = {
"title" : _("Temperature"),
"title" : _("Temperature(C)"),
"unit" : "",
"color" : "41/b",
}

graph_info.append({
"title" : _("Temperature"),
"title" : _("Temperature(C)"),
"metrics" : [
( "temperature", "line" ),
],
})

metric_info["memory_used"] = {
"title" : _("Memory used"),
metric_info["memory_util"] = {
"title" : _("Memory utilization"),
"unit" : "%",
"color" : "21/b",
}

graph_info.append({
"title" : _("Memory utilization"),
"metrics" : [
( "memory_util", "line" ),
],
})

metric_info["gpu_fb_memory_usage_used"] = {
"title" : _("Memory used"),
"unit" : "bytes",
"color" : "21/b",
}

graph_info.append({
"title" : _("Memory used"),
"metrics" : [
( "memory_used", "line" ),
( "gpu_fb_memory_usage_used", "line" ),
],
})
