
Commit dedc361

Merge pull request #55 from MoseleyBioinformaticsLab/amd
Separates GPU querying into two classes, one for NVIDIA and one for AMD, that inherit from a common interface
2 parents d703c06 + a5fbb41 commit dedc361
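
Note: the condensed sketch below illustrates the pattern this pull request introduces. The names are shortened for illustration; the actual classes in src/gpu_tracker/tracker.py (shown in the diff below) are _GPUQuerier, _NvidiaQuerier, and _AMDQuerier, their query methods return pandas DataFrames rather than raw CSV text, and, as in the diff, the AMD subclass only declares its command here.

    from __future__ import annotations
    import abc
    import subprocess as subp


    class GPUQuerier(abc.ABC):
        """Common interface: subclasses differ only in the SMI command they wrap."""
        command: str  # e.g. 'nvidia-smi' or 'amd-smi'

        @classmethod
        def is_available(cls) -> bool | None:
            # True: the command runs and reaches a GPU; False: installed but errors out; None: not installed.
            try:
                subp.check_output(cls.command)
                return True
            except subp.CalledProcessError:
                return False
            except FileNotFoundError:
                return None

        @classmethod
        @abc.abstractmethod
        def static_info(cls):
            """Per-GPU static information such as UUIDs and total memory."""


    class NvidiaQuerier(GPUQuerier):
        command = 'nvidia-smi'

        @classmethod
        def static_info(cls):
            # Simplified relative to the real class: returns raw CSV text instead of a parsed DataFrame.
            return subp.check_output([cls.command, '--query-gpu=uuid,memory.total', '--format=csv']).decode()


    class AMDQuerier(GPUQuerier):
        command = 'amd-smi'
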

4 files changed: +165 −56 lines

src/gpu_tracker/__main__.py

+4 −2

@@ -4,7 +4,7 @@
 Usage:
     gpu-tracker -h | --help
     gpu-tracker -v | --version
-    gpu-tracker --execute=<command> [--output=<output>] [--format=<format>] [--st=<sleep-time>] [--ru=<ram-unit>] [--gru=<gpu-ram-unit>] [--tu=<time-unit>] [--nec=<num-cores>] [--guuids=<gpu-uuids>] [--disable-logs]
+    gpu-tracker --execute=<command> [--output=<output>] [--format=<format>] [--st=<sleep-time>] [--ru=<ram-unit>] [--gru=<gpu-ram-unit>] [--tu=<time-unit>] [--nec=<num-cores>] [--guuids=<gpu-uuids>] [--disable-logs] [--gb=<gpu-brand>]

 Options:
     -h --help               Show this help message and exit.
@@ -19,6 +19,7 @@
     --nec=<num-cores>       The number of cores expected to be used. Defaults to the number of cores in the entire operating system.
     --guuids=<gpu-uuids>    Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system.
     --disable-logs          If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual.
+    --gb=<gpu-brand>        The brand of GPU to profile. Valid values are nvidia and amd. Defaults to the brand of GPU detected in the system, checking NVIDIA first.
 """
 import docopt as doc
 import subprocess as subp
@@ -41,7 +42,8 @@ def main():
         '--tu': 'time_unit',
         '--nec': 'n_expected_cores',
         '--guuids': 'gpu_uuids',
-        '--disable-logs': 'disable_logs'
+        '--disable-logs': 'disable_logs',
+        '--gb': 'gpu_brand'
     }
     kwargs = {
         option_map[option]: value for option, value in args.items() if value is not None and option not in {
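
For reference, the new --gb flag is passed like the existing options; for example, with a hypothetical workload standing in for a real command:

    gpu-tracker --execute='python train.py' --st=0.5 --gb=amd

Omitting --gb keeps the default behavior of auto-detecting the GPU brand, checking NVIDIA before AMD.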

src/gpu_tracker/tracker.py

+109 −37

@@ -1,5 +1,6 @@
 """The ``tracker`` module contains the ``Tracker`` class which can alternatively be imported directly from the ``gpu_tracker`` package."""
 from __future__ import annotations
+import abc
 import json
 import dataclasses as dclass
 import platform
@@ -17,6 +18,66 @@
 import pandas as pd


+class _GPUQuerier(abc.ABC):
+    @classmethod
+    def _query_gpu(cls, *args) -> pd.DataFrame:
+        output = subp.check_output((cls.command,) + args, stderr=subp.STDOUT).decode()
+        gpu_info = pd.read_csv(io.StringIO(output))
+        return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
+
+    @classmethod
+    def is_available(cls) -> bool | None:
+        try:
+            subp.check_output(cls.command)
+            return True
+        except subp.CalledProcessError:
+            return False
+        except FileNotFoundError:
+            return None
+
+    @classmethod
+    @abc.abstractmethod
+    def static_info(cls) -> pd.DataFrame:
+        pass  # pragma: nocover
+
+    @classmethod
+    @abc.abstractmethod
+    def process_ram(cls) -> pd.DataFrame:
+        pass  # pragma: nocover
+
+    @classmethod
+    @abc.abstractmethod
+    def ram_and_utilization(cls) -> pd.DataFrame:
+        pass  # pragma: nocover
+
+class _NvidiaQuerier(_GPUQuerier):
+    command = 'nvidia-smi'
+
+    @classmethod
+    def _query_gpu(cls, *args: list[str], ram_column: str | None = None):
+        gpu_info = super()._query_gpu(*args, '--format=csv')
+        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
+        gpu_info[ram_column] = gpu_info[ram_column].apply(lambda ram: int(ram.replace('MiB', '').strip()))
+        return gpu_info.rename(columns={ram_column: 'ram'})
+
+    @classmethod
+    def static_info(cls) -> pd.DataFrame:
+        return cls._query_gpu('--query-gpu=uuid,memory.total', ram_column='memory.total')
+
+    @classmethod
+    def process_ram(cls) -> pd.DataFrame:
+        return cls._query_gpu('--query-compute-apps=pid,used_gpu_memory', ram_column='used_gpu_memory')
+
+    @classmethod
+    def ram_and_utilization(cls) -> pd.DataFrame:
+        gpu_info = cls._query_gpu('--query-gpu=uuid,memory.used,utilization.gpu', ram_column='memory.used')
+        gpu_info = gpu_info.rename(columns={'utilization.gpu': 'utilization_percent'})
+        gpu_info.utilization_percent = [float(percentage.replace('%', '').strip()) for percentage in gpu_info.utilization_percent]
+        return gpu_info
+
+class _AMDQuerier(_GPUQuerier):
+    command = 'amd-smi'
+
 class _TrackingProcess(mproc.Process):
     _CPU_PERCENT_INTERVAL = 0.1
     _ram_unit2coefficient = {
@@ -43,7 +104,7 @@ class _TrackingProcess(mproc.Process):
     def __init__(
             self, stop_event: mproc.Event, sleep_time: float, ram_unit: str, gpu_ram_unit: str, time_unit: str,
             n_expected_cores: int | None, gpu_uuids: set[str] | None, disable_logs: bool, main_process_id: int,
-            resource_usage_file: str, extraneous_process_ids: set[int]):
+            resource_usage_file: str, extraneous_process_ids: set[int], gpu_brand: str | None):
         super().__init__()
         self._stop_event = stop_event
         if sleep_time < _TrackingProcess._CPU_PERCENT_INTERVAL:
@@ -63,24 +124,45 @@ def __init__(
         self._hardware_percent_sums = {key: 0. for key in percent_keys}
         self._tracking_iteration = 1
         self._is_linux = platform.system().lower() == 'linux'
-        self._nvidia_available = True
-        try:
-            subp.check_output('nvidia-smi')
-        except FileNotFoundError:
-            self._nvidia_available = False
-            self._log_warning(
-                'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. '
-                'Otherwise the Max GPU RAM values will remain 0.0')
+        cannot_connect_warning = ('The {} command is installed but cannot connect to a GPU. '
+                                  'The GPU RAM and GPU utilization values will remain 0.0.')
+        if gpu_brand is None:
+            nvidia_available = _NvidiaQuerier.is_available()
+            nvidia_installed = nvidia_available is not None
+            nvidia_available = bool(nvidia_available)
+            amd_available = _AMDQuerier.is_available()
+            amd_installed = amd_available is not None
+            amd_available = bool(amd_available)
+            if nvidia_available:
+                gpu_brand = 'nvidia'
+            elif amd_available:
+                gpu_brand = 'amd'
+            elif nvidia_installed:
+                self._log_warning(cannot_connect_warning.format('nvidia-smi'))
+            elif amd_installed:
+                self._log_warning(cannot_connect_warning.format('amd-smi'))
+            else:
+                self._log_warning(
+                    'Neither the nvidia-smi command nor the amd-smi command is installed. Install one of these to profile the GPU. '
+                    'Otherwise the GPU RAM and GPU utilization values will remain 0.0.')
+        if gpu_brand == 'nvidia':
+            self._gpu_querier = _NvidiaQuerier
+        elif gpu_brand == 'amd':
+            self._gpu_querier = _AMDQuerier
+        elif gpu_brand is None:
+            self._gpu_querier = None
+        else:
+            raise ValueError(f'"{gpu_brand}" is not a valid GPU brand. Supported values are "nvidia" and "amd".')
         max_ram = MaxRAM(unit=ram_unit, system_capacity=psutil.virtual_memory().total * self._ram_coefficient)
         system_core_count = psutil.cpu_count()
         cpu_utilization = CPUUtilization(
             system_core_count=system_core_count,
             n_expected_cores=n_expected_cores if n_expected_cores is not None else system_core_count)
-        if self._nvidia_available:
-            gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.total')
-            gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info, column='memory.total')
+        if self._gpu_querier:
+            gpu_info = self._gpu_querier.static_info()
+            gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info)
             max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=gpu_ram_system_capacity)
-            all_uuids = set(gpu_info['uuid'])
+            all_uuids = set(gpu_info.uuid)
             if gpu_uuids is None:
                 self._gpu_uuids = all_uuids
             else:
@@ -143,25 +225,23 @@ def run(self):
             self._resource_usage.max_ram.system = max(
                 self._resource_usage.max_ram.system, psutil.virtual_memory().used * self._ram_coefficient)
             # Get the maximum GPU RAM usage if available.
-            if self._nvidia_available:  # pragma: nocover
-                gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-compute-apps=pid,used_gpu_memory')
+            if self._gpu_querier:  # pragma: nocover
+                gpu_info = self._gpu_querier.process_ram()
                 if len(gpu_info):
                     process_ids = {self._main_process_id}
                     self._update_gpu_ram(attr='main', process_ids=process_ids, gpu_info=gpu_info)
                     process_ids = set(self._map_processes(processes=descendant_processes, map_func=lambda process: process.pid))
                     self._update_gpu_ram(attr='descendants', process_ids=process_ids, gpu_info=gpu_info)
                     process_ids.add(self._main_process_id)
                     self._update_gpu_ram(attr='combined', process_ids=process_ids, gpu_info=gpu_info)
-                gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.used,utilization.gpu')
-                system_gpu_ram = self._get_gpu_ram(gpu_info, column='memory.used')
+                gpu_info = self._gpu_querier.ram_and_utilization()
+                system_gpu_ram = self._get_gpu_ram(gpu_info)
                 self._resource_usage.max_gpu_ram.system = max(self._resource_usage.max_gpu_ram.system, system_gpu_ram)
-                gpu_info = gpu_info.loc[gpu_info['uuid'].apply(lambda gpu_uuid: gpu_uuid in self._gpu_uuids)]
-                gpu_percentages = [float(percentage.replace('%', '').strip()) for percentage in gpu_info['utilization.gpu']]
+                gpu_info = gpu_info.loc[[uuid in self._gpu_uuids for uuid in gpu_info.uuid]]
                 self._update_processing_unit_utilization(
-                    current_percentages=gpu_percentages,
+                    current_percentages=list(gpu_info.utilization_percent),
                     processing_unit_percentages=self._resource_usage.gpu_utilization.gpu_percentages, percent_key='gpu',
                     n_hardware_units=self._resource_usage.gpu_utilization.n_expected_gpus)
-
             # Get the mean and maximum CPU usages.
             main_n_threads = self._map_processes([main_process], map_func=get_n_threads)
             descendant_n_threads = self._map_processes(descendant_processes, map_func=get_n_threads)
@@ -230,23 +310,13 @@ def _update_ram(self, rss_values: RSSValues, memory_maps_list: list[list] | None
         rss_values.total_rss = max(rss_values.total_rss, total_rss)

     def _update_gpu_ram(self, attr: str, process_ids: set[int], gpu_info: pd.DataFrame):
-        gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info['pid']]]
-        gpu_ram = self._get_gpu_ram(gpu_info, column='used_gpu_memory')
+        gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info.pid]]
+        gpu_ram = self._get_gpu_ram(gpu_info)
         max_gpu_ram = getattr(self._resource_usage.max_gpu_ram, attr)
         setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, gpu_ram))

-    @staticmethod
-    def _query_gpu(nvidia_command: str) -> pd.DataFrame:
-        command = f'nvidia-smi {nvidia_command} --format=csv'
-        output = subp.check_output(command.split(), stderr=subp.STDOUT).decode()
-        gpu_info = pd.read_csv(io.StringIO(output))
-        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
-        return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
-
-    def _get_gpu_ram(self, gpu_info: pd.DataFrame, column: str) -> float:
-        gpu_rams = gpu_info[column]
-        gpu_rams = gpu_rams.apply(lambda ram: int(ram.replace('MiB', '').strip()))
-        return sum(gpu_rams) * self._gpu_ram_coefficient
+    def _get_gpu_ram(self, gpu_info: pd.DataFrame) -> float:
+        return sum(gpu_info.ram) * self._gpu_ram_coefficient

     def _update_processing_unit_utilization(
             self, current_percentages: list[float], processing_unit_percentages: ProcessingUnitPercentages,
@@ -297,7 +367,8 @@ class State(enum.Enum):
     def __init__(
             self, sleep_time: float = 1.0, ram_unit: str = 'gigabytes', gpu_ram_unit: str = 'gigabytes', time_unit: str = 'hours',
             n_expected_cores: int = None, gpu_uuids: set[str] = None, disable_logs: bool = False, process_id: int = None,
-            resource_usage_file: str | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0):
+            resource_usage_file: str | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0,
+            gpu_brand: str | None = None):
         """
         :param sleep_time: The number of seconds to sleep in between usage-collection iterations.
         :param ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.
@@ -310,6 +381,7 @@ def __init__(
         :param resource_usage_file: The file path to the pickle file containing the ``resource_usage`` attribute. This file is automatically deleted and the ``resource_usage`` attribute is set in memory if the tracking successfully completes. But if the tracking is interrupted, the tracking information will be saved in this file as a backup. Defaults to a randomly generated file name in the current working directory of the format ``.gpu-tracker_<random UUID>.pkl``.
         :param n_join_attempts: The number of times the tracker attempts to join its underlying sub-process.
         :param join_timeout: The amount of time the tracker waits for its underlying sub-process to join.
+        :param gpu_brand: The brand of GPU to profile. Valid values are "nvidia" and "amd". Defaults to the brand of GPU detected in the system, checking Nvidia first.
         :raises ValueError: Raised if invalid units are provided.
         """
         current_process_id = os.getpid()
@@ -323,7 +395,7 @@ def __init__(
         self._resource_usage_file = f'.gpu-tracker_{uuid.uuid1()}.pkl' if resource_usage_file is None else resource_usage_file
         self._tracking_process = _TrackingProcess(
             self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, n_expected_cores, gpu_uuids, disable_logs,
-            process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids)
+            process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids, gpu_brand)
         self.resource_usage = None
         self.n_join_attempts = n_join_attempts
         self.join_timeout = join_timeout
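
On the API side, the same option surfaces as the new gpu_brand keyword argument of Tracker. A minimal sketch, showing construction only (starting and stopping the tracker are unchanged by this pull request):

    from gpu_tracker import Tracker

    # gpu_brand=None (the default) auto-detects the brand, checking nvidia-smi before amd-smi.
    # Passing 'nvidia' or 'amd' skips detection; any other value raises a ValueError.
    tracker = Tracker(gpu_ram_unit='megabytes', gpu_brand='amd')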

tests/test_cli.py

+8 −5

@@ -17,13 +17,16 @@ def get_output(request) -> str | None:
 test_data = [
     (['-e', 'my-command', '--ru=kilobytes'], ['my-command'], {'disable_logs': False, 'ram_unit': 'kilobytes'}),
     (['--execute', 'my-command arg1 ', '--disable-logs'], ['my-command', 'arg1'], {'disable_logs': True}),
-    (['--execute=my-command arg1 arg2', '--st=0.4'], ['my-command', 'arg1', 'arg2'], {'disable_logs': False, 'sleep_time': 0.4}),
     (
-        ['-e', 'my-command', '--gru=megabytes', '--tu=days'], ['my-command'],
-        {'disable_logs': False, 'gpu_ram_unit': 'megabytes', 'time_unit': 'days'}),
+        ['--execute=my-command arg1 arg2', '--st=0.4', '--gb=nvidia'], ['my-command', 'arg1', 'arg2'],
+        {'disable_logs': False, 'sleep_time': 0.4, 'gpu_brand': 'nvidia'}
+    ),
     (
-        ['-e', 'my-command', '--nec=3', '--guuids=gpu-id1,gpu-id2,gpu-id3'], ['my-command'],
-        {'disable_logs': False, 'n_expected_cores': 3, 'gpu_uuids': {'gpu-id1', 'gpu-id2', 'gpu-id3'}}),
+        ['-e', 'my-command', '--gru=megabytes', '--tu=days', '--gb=amd'], ['my-command'],
+        {'disable_logs': False, 'gpu_ram_unit': 'megabytes', 'time_unit': 'days', 'gpu_brand': 'amd'}),
+    (
+        ['-e', 'my-command', '--nec=3', '--guuids=gpu-id1,gpu-id2,gpu-id3', '--gb=amd'], ['my-command'],
+        {'disable_logs': False, 'n_expected_cores': 3, 'gpu_uuids': {'gpu-id1', 'gpu-id2', 'gpu-id3'}, 'gpu_brand': 'amd'}),
     (['-e', 'my-command', '--guuids=gpu-id1'], ['my-command'], {'disable_logs': False, 'gpu_uuids': {'gpu-id1'}})]