diff --git a/check_mk/nvidia_smi/agent_based/nvidia_smi.py b/check_mk/nvidia_smi/agent_based/nvidia_smi.py
new file mode 100644
index 0000000..cb63483
--- /dev/null
+++ b/check_mk/nvidia_smi/agent_based/nvidia_smi.py
@@ -0,0 +1,33 @@
+from .agent_based_api.v1 import *
+import pprint
+
+
+def discover_nvidia(section):
+    for sector, used, slots in section:
+        yield Service(item=sector)
+
+
+def check_nvidia(item, section):
+    for sector, used, slots in section:
+        if sector == item:
+            used = int(used)  # convert string to int
+            slots = int(slots)  # convert string to int
+            if used == slots:
+                s = State.CRIT
+            elif slots - used <= 10:
+                s = State.WARN
+            else:
+                s = State.OK
+            yield Result(
+                state = s,
+                summary = f"used {used} of {slots}")
+            yield Metric('gpumem', used, levels=(90, None), boundaries=(0,100))
+            return
+
+
+register.check_plugin(
+    name = "nvidia_gpu",
+    service_name = "GPU %s",
+    discovery_function = discover_nvidia,
+    check_function = check_nvidia,
+)
diff --git a/check_mk/nvidia_smi/agents/plugins/nvidia_smi b/check_mk/nvidia_smi/agents/plugins/nvidia_smi
old mode 100644
new mode 100755
index 3b077af..8842741
--- a/check_mk/nvidia_smi/agents/plugins/nvidia_smi
+++ b/check_mk/nvidia_smi/agents/plugins/nvidia_smi
@@ -9,17 +9,59 @@ nvidiaSMI = xml.dom.minidom.parseString(Popen(["nvidia-smi", "-q", "-x"], stdout
 for gpu in nvidiaSMI.getElementsByTagName('gpu'):
     id_gpu = gpu.getAttribute('id')
-    gpu_utilization = int(gpu
-        .getElementsByTagName("utilization")[0]
-        .getElementsByTagName("gpu_util")[0]
-        .childNodes[0]
-        .data.split()[0])
-    gpu_mem_usage = int(gpu
-        .getElementsByTagName("utilization")[0]
-        .getElementsByTagName("memory_util")[0]
-        .childNodes[0]
-        .data.split()[0])
+    # Make sure assumptions about units match reality
+    unit = (gpu
+        .getElementsByTagName("fb_memory_usage")[0]
+        .getElementsByTagName("used")[0]
+        .childNodes[0].data.split()[1])
+    if unit != 'MiB':
+        raise AssertionError('Value of fb_memory_usage_used: ' + unit)
+    unit = (gpu
+        .getElementsByTagName("fb_memory_usage")[0]
+        .getElementsByTagName("total")[0]
+        .childNodes[0].data.split()[1])
+    if unit != 'MiB':
+        raise AssertionError('Value of fb_memory_usage_total: ' + unit)
+
+    try:
+        gpu_utilization = int(gpu
+            .getElementsByTagName("utilization")[0]
+            .getElementsByTagName("gpu_util")[0]
+            .childNodes[0]
+            .data.split()[0])
+    except ValueError:
+        gpu_utilization = None
+
+
+    try:
+        gpu_mem_util = int(gpu
+            .getElementsByTagName("utilization")[0]
+            .getElementsByTagName("memory_util")[0]
+            .childNodes[0]
+            .data.split()[0])
+    except ValueError:
+        gpu_mem_util = None
+
+
+    try:
+        gpu_fb_memory_usage_used = int(gpu
+            .getElementsByTagName("fb_memory_usage")[0]
+            .getElementsByTagName("used")[0]
+            .childNodes[0]
+            .data.split()[0]) * 1024*1024
+    except ValueError:
+        gpu_fb_memory_usage_used = None
+
+    try:
+        gpu_fb_memory_usage_total = int(gpu
+            .getElementsByTagName("fb_memory_usage")[0]
+            .getElementsByTagName("total")[0]
+            .childNodes[0]
+            .data.split()[0]) * 1024*1024
+    except ValueError:
+        gpu_fb_memory_usage_total = None
+
     try:
         gpu_temperature = int(gpu
@@ -28,31 +70,54 @@ for gpu in nvidiaSMI.getElementsByTagName('gpu'):
             .childNodes[0]
             .data.split()[0])
     except ValueError:
-        gpu_temperature: int = 0
-
-    gpu_sm_clock = int(gpu
-        .getElementsByTagName("clocks")[0]
-        .getElementsByTagName("sm_clock")[0]
-        .childNodes[0]
-        .data.split()[0])
-
-    gpu_graphics_clock = int(gpu
-        .getElementsByTagName("clocks")[0]
-        .getElementsByTagName("graphics_clock")[0]
-        .childNodes[0]
-        .data.split()[0])
-
-    gpu_mem_clock = int(gpu
-        .getElementsByTagName("clocks")[0]
-        .getElementsByTagName("mem_clock")[0]
-        .childNodes[0]
-        .data.split()[0])
-
-    print(f"smi nvidia gpu_utilization {gpu_utilization}")
-    print(f"smi nvidia memory_used {gpu_mem_usage}")
-    print(f"smi nvidia temperature {gpu_temperature}")
-    print(f"smi nvidia graphics_clock {gpu_graphics_clock}")
-    print(f"smi nvidia sm_clock {gpu_sm_clock}")
-    print(f"smi nvidia msm_clock {gpu_mem_clock}")
+        gpu_temperature = None
+
+
+    try:
+        gpu_sm_clock = int(gpu
+            .getElementsByTagName("clocks")[0]
+            .getElementsByTagName("sm_clock")[0]
+            .childNodes[0]
+            .data.split()[0])
+    except ValueError:
+        gpu_sm_clock = None
+
+
+    try:
+        gpu_graphics_clock = int(gpu
+            .getElementsByTagName("clocks")[0]
+            .getElementsByTagName("graphics_clock")[0]
+            .childNodes[0]
+            .data.split()[0])
+    except ValueError:
+        gpu_graphics_clock = None
+
+
+    try:
+        gpu_mem_clock = int(gpu
+            .getElementsByTagName("clocks")[0]
+            .getElementsByTagName("mem_clock")[0]
+            .childNodes[0]
+            .data.split()[0])
+    except ValueError:
+        gpu_mem_clock = None
+
+
+    if gpu_utilization is not None or \
+       gpu_fb_memory_usage_total is not None or \
+       gpu_fb_memory_usage_used is not None or \
+       gpu_mem_util is not None or \
+       gpu_temperature is not None or \
+       gpu_graphics_clock is not None or \
+       gpu_sm_clock is not None or \
+       gpu_mem_clock is not None:
+        print(f"smi nvidia gpu_utilization {gpu_utilization}")
+        print(f"smi nvidia memory_util {gpu_mem_util}")
+        print(f"smi nvidia gpu_fb_memory_usage_total {gpu_fb_memory_usage_total}")
+        print(f"smi nvidia gpu_fb_memory_usage_used {gpu_fb_memory_usage_used}")
+        print(f"smi nvidia temperature {gpu_temperature}")
+        print(f"smi nvidia graphics_clock {gpu_graphics_clock}")
+        print(f"smi nvidia sm_clock {gpu_sm_clock}")
+        print(f"smi nvidia msm_clock {gpu_mem_clock}")
 
     i += 1
diff --git a/check_mk/nvidia_smi/checkman/nvidia_smi b/check_mk/nvidia_smi/checkman/nvidia_smi
index 3757657..fd2ac2a 100644
--- a/check_mk/nvidia_smi/checkman/nvidia_smi
+++ b/check_mk/nvidia_smi/checkman/nvidia_smi
@@ -4,9 +4,9 @@ catalog: app/
 license: GPL
 distribution: check_mk
 description:
- Blah Blah
+ Status of NVIDIA GPU, based on output from nvidia-smi.
 
 inventory:
  One service per configured nvidia will be created.
 item:
- Blah Blah
+ Status of GPU.
diff --git a/check_mk/nvidia_smi/checks/nvidia_smi.py b/check_mk/nvidia_smi/checks/nvidia_smi.py
index c3f16b2..0ec7e18 100644
--- a/check_mk/nvidia_smi/checks/nvidia_smi.py
+++ b/check_mk/nvidia_smi/checks/nvidia_smi.py
@@ -31,6 +31,8 @@ def nvidia_smi_parse(info):
         if len(line) != 4:
             continue # Skip unexpected lines
         pool_name, pm_type, metric, value = line
+        if value == 'None':
+            continue
         item = '%s [%s]' % (pool_name, pm_type)
         if item not in data:
             data[item] = {}
@@ -58,33 +60,59 @@ def check_nvidia_smi(item, params, info):
     data = all_data[item]
 
     perfkeys = [
-        'gpu_utilization', 'memory_used', 'temperature',
+        'gpu_utilization', 'memory_util', 'temperature',
         'graphics_clock', 'msm_clock', 'sm_clock',
+        'gpu_fb_memory_usage_used'
     ]
 
     # Add some more values, derived from the raw ones...
     this_time = int(time.time())
+    # for key in ['accepted_conn', 'max_children_reached', 'slow_requests']:
+    #     per_sec = get_rate("nginx_status.%s" % key, this_time, data[key])
+    #     data['%s_per_sec' % key] = per_sec
+    #     perfkeys.append('%s_per_sec' % key)
 
     perfdata = []
     for i, key in enumerate(perfkeys):
-        perfdata.append( (key, data[key]) )
+        try:
+            reading = data[key]
+            perfdata.append( (key, data[key]) )
+        except KeyError:
+            pass
     perfdata.sort()
 
     worst_state = 0
 
-    proc_warn, proc_crit = params.get('gpu_utilization', (None, None))
-    proc_txt = ''
-    if proc_crit is not None and data['gpu_utilization'] > proc_crit:
-        worst_state = max(worst_state, 2)
-        proc_txt = ' (!!)'
-    elif proc_warn is not None and data['gpu_utilization'] > proc_warn:
-        worst_state = max(worst_state, 1)
-        proc_txt = ' (!)'
-
-    output = [
-        'GPU util: %d%s memory used: %d, Temperature %d' % (
-            data['gpu_utilization'], proc_txt, data['memory_used'], data['temperature'],
-        ),
-    ]
+    try:
+        proc_warn, proc_crit = params.get('gpu_utilization', (None, None))
+        proc_txt = ''
+        if proc_crit is not None and data['gpu_utilization'] > proc_crit:
+            worst_state = max(worst_state, 2)
+            proc_txt = ' (!!)'
+        elif proc_warn is not None and data['gpu_utilization'] > proc_warn:
+            worst_state = max(worst_state, 1)
+            proc_txt = ' (!)'
+    except KeyError:
+        worst_state = 0
+        proc_txt = ''
+
+    # output = [
+    #     'Active: %d%s (%d idle, %d waiting)' % (
+    #         data['gpu_utilization'], proc_txt, data['gpu_fb_memory_usage_used'], data['temperature'],
+    #     ),
+    #     'Started %s ago' % (get_age_human_readable(data['graphics_clock'])),
+    #     'Requests: %0.2f/s' % (data['msm_clock']),
+    # ]
+
+    if 'gpu_utilization' in data and 'gpu_fb_memory_usage_used' in data and 'gpu_fb_memory_usage_total' in data and 'temperature' in data:
+        mem_used_MiB = data['gpu_fb_memory_usage_used'] / (1024*1024)
+        mem_total_MiB = data['gpu_fb_memory_usage_total'] / (1024*1024)
+        output = [
+            'GPU util: %d%%%s, memory used: %d MiB of %d MiB, temperature %dC' % (
+                data['gpu_utilization'], proc_txt, mem_used_MiB, mem_total_MiB, data['temperature'],
+            ),
+        ]
+    else:
+        output = [ 'Too little data from nvidia-smi, assuming GPU is OK', ]
 
     return worst_state, ', '.join(output), perfdata
diff --git a/check_mk/nvidia_smi/mkp_packaging/nvidia_smi b/check_mk/nvidia_smi/mkp_packaging/nvidia_smi
new file mode 100644
index 0000000..7e0fde9
--- /dev/null
+++ b/check_mk/nvidia_smi/mkp_packaging/nvidia_smi
@@ -0,0 +1,23 @@
+{'author': 'Anton Zhelyazkov',
+ 'description': 'Determine NVIDIA GPU status based on nvidia-smi output',
+ 'download_url': 'https://github.com/antonzhelyazkov/nagiosScripts/tree/master/check_mk/nvidia_smi',
+ 'files': {'agent_based': ['nvidia_smi.py'],
+           'agents': ['plugins/nvidia_smi'],
+           'alert_handlers': [],
+           'bin': [],
+           'checkman': ['nvidia_smi'],
+           'checks': ['nvidia_smi'],
+           'doc': [],
+           'inventory': [],
+           'lib': [],
+           'locales': [],
+           'mibs': [],
+           'notifications': [],
+           'pnp-templates': [],
+           'web': ['plugins/metrics/nvidia_smi.py']},
+ 'name': 'nvidia_smi',
+ 'title': 'GPU check based on nvidia_smi',
+ 'version': '1.0',
+ 'version.min_required': '2.0.0',
+ 'version.packaged': '2.0.0',
+ 'version.usable_until': None}
diff --git a/check_mk/nvidia_smi/web/plugins/metrics/nvidia_smi.py b/check_mk/nvidia_smi/web/plugins/metrics/nvidia_smi.py
index bd9e24b..ea2aa54 100644
--- a/check_mk/nvidia_smi/web/plugins/metrics/nvidia_smi.py
+++ b/check_mk/nvidia_smi/web/plugins/metrics/nvidia_smi.py
@@ -46,28 +46,41 @@
 })
 
 metric_info["temperature"] = {
-    "title" : _("Temperature"),
+    "title" : _("Temperature (C)"),
     "unit" : "",
"color" : "41/b", } graph_info.append({ - "title" : _("Temperature"), + "title" : _("Temperature(C)"), "metrics" : [ ( "temperature", "line" ), ], }) -metric_info["memory_used"] = { - "title" : _("Memory used"), +metric_info["memory_util"] = { + "title" : _("Memory utilization"), "unit" : "%", "color" : "21/b", } +graph_info.append({ + "title" : _("Memory utilization"), + "metrics" : [ + ( "memory_util", "line" ), + ], +}) + +metric_info["gpu_fb_memory_usage_used"] = { + "title" : _("Memory used"), + "unit" : "bytes", + "color" : "21/b", +} + graph_info.append({ "title" : _("Memory used"), "metrics" : [ - ( "memory_used", "line" ), + ( "gpu_fb_memory_usage_used", "line" ), ], })