Skip to content

Commit 2c006ec

Browse files
authored
Merge pull request #56 from MoseleyBioinformaticsLab/amd
Implements the AMD GPU querier class
2 parents dedc361 + a724702 commit 2c006ec

File tree

2 files changed

+73
-16
lines changed

2 files changed

+73
-16
lines changed

src/gpu_tracker/tracker.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ class _NvidiaQuerier(_GPUQuerier):
5454
command = 'nvidia-smi'
5555

5656
@classmethod
57-
def _query_gpu(cls, *args: list[str], ram_column: str | None = None):
57+
def _query_gpu(cls, *args: list[str], ram_column: str):
5858
gpu_info = super()._query_gpu(*args, '--format=csv')
5959
gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
6060
gpu_info[ram_column] = gpu_info[ram_column].apply(lambda ram: int(ram.replace('MiB', '').strip()))
@@ -75,8 +75,45 @@ def ram_and_utilization(cls) -> pd.DataFrame:
7575
gpu_info.utilization_percent = [float(percentage.replace('%', '').strip()) for percentage in gpu_info.utilization_percent]
7676
return gpu_info
7777

78+
7879
class _AMDQuerier(_GPUQuerier):
    """Queries AMD GPUs through the ``amd-smi`` command-line tool.

    Every query is issued with ``--csv``. The resulting table is normalized so that the integer ``gpu`` column
    (when present) becomes a ``uuid`` column and the subcommand-specific RAM column is renamed to ``ram``.
    """
    command = 'amd-smi'
    # Lazily built cache mapping the integer GPU ID to its UUID.
    # Name-mangled to ``_AMDQuerier__id_to_uuid``; resetting it to None forces a fresh ``amd-smi list`` query.
    __id_to_uuid = None

    @classmethod
    def _id_to_uuid(cls) -> dict[int, str]:
        """Return the cached GPU-ID-to-UUID mapping, querying ``amd-smi list --csv`` on first use.

        This is a plain classmethod rather than ``@classmethod`` stacked on ``@property``: that combination
        was deprecated in Python 3.11 and removed in Python 3.13.
        """
        if cls.__id_to_uuid is None:
            # Call the parent implementation directly; this class's own _query_gpu would recurse back into here.
            gpu_info = super()._query_gpu('list', '--csv')
            cls.__id_to_uuid = dict(zip(gpu_info.gpu, gpu_info.gpu_uuid))
        return cls.__id_to_uuid

    @classmethod
    def _query_gpu(cls, *args: list[str], ram_column: str) -> pd.DataFrame:
        """Run ``amd-smi`` with ``args`` plus ``--csv`` and normalize the column names.

        :param args: The subcommand and options to pass to ``amd-smi``.
        :param ram_column: The name of the column in the output that holds the RAM value; it is renamed to ``ram``.
        :return: The query result with ``gpu`` replaced by ``uuid`` (when present) and ``ram_column`` renamed to ``ram``.
        :raises KeyError: If the output contains a GPU ID that is absent from the ID-to-UUID mapping.
        """
        gpu_info = super()._query_gpu(*args, '--csv')
        if 'gpu' in gpu_info.columns:
            id_to_uuid = cls._id_to_uuid()
            # A list comprehension (rather than Series.map) keeps the KeyError on an unknown GPU ID.
            gpu_info['gpu'] = [id_to_uuid[gpu_id] for gpu_id in gpu_info['gpu']]
            gpu_info = gpu_info.rename(columns={'gpu': 'uuid'})
        return gpu_info.rename(columns={ram_column: 'ram'})

    @classmethod
    def static_info(cls) -> pd.DataFrame:
        """Return the ``uuid`` and ``ram`` columns of ``amd-smi static --vram`` (``ram`` taken from ``size``)."""
        gpu_info = cls._query_gpu('static', '--vram', ram_column='size')
        return gpu_info[['uuid', 'ram']]

    @classmethod
    def process_ram(cls) -> pd.DataFrame:
        """Return the ``pid`` and ``ram`` columns of ``amd-smi process`` (``ram`` taken from ``vram_mem``)."""
        gpu_info = cls._query_gpu('process', ram_column='vram_mem')
        # RAM is in bytes for the process subcommand, so it is scaled down by 1e6.
        gpu_info['ram'] = [ram / 1e6 for ram in gpu_info['ram']]
        return gpu_info[['pid', 'ram']]

    @classmethod
    def ram_and_utilization(cls) -> pd.DataFrame:
        """Return ``uuid``, ``utilization_percent`` and ``ram`` from ``amd-smi monitor --vram-usage --gfx``.

        ``ram`` is taken from ``vram_used`` and ``utilization_percent`` is the ``gfx`` column converted to float.
        """
        gpu_info = cls._query_gpu('monitor', '--vram-usage', '--gfx', ram_column='vram_used')
        # astype returns a new frame, which avoids assigning a column on a slice of the original DataFrame.
        gpu_info = gpu_info[['uuid', 'gfx', 'ram']].astype({'gfx': float})
        return gpu_info.rename(columns={'gfx': 'utilization_percent'})
116+
80117

81118
class _TrackingProcess(mproc.Process):
82119
_CPU_PERCENT_INTERVAL = 0.1

tests/test_tracker.py

+35-15
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
'GPU. Otherwise the GPU RAM and GPU utilization values will remain 0.0.')
1111

1212

13+
@pt.fixture(name='gpu_brand', params=['amd', 'nvidia'])
def get_gpu_brand(request) -> str:
    """Parametrized fixture that supplies each GPU brand ('amd', then 'nvidia') to the tests that request it."""
    brand = request.param
    yield brand
16+
17+
1318
@pt.fixture(name='operating_system', params=['Linux', 'not-linux'])
1419
def get_operating_system(request) -> str:
1520
yield request.param
@@ -30,8 +35,8 @@ def get_use_context_manager(request) -> bool:
3035

3136
@pt.mark.parametrize('ram_unit,gpu_ram_unit,time_unit,gpu_uuids,n_expected_cores', test_tracker_data)
3237
def test_tracker(
33-
mocker, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str, gpu_uuids: set[str],
34-
n_expected_cores: int):
38+
mocker, gpu_brand: str, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str,
39+
gpu_uuids: set[str], n_expected_cores: int):
3540
class EventMock:
3641
def __init__(self):
3742
self.count = 0
@@ -126,17 +131,31 @@ def start_mock(self):
126131
'gpu_tracker.tracker.psutil.virtual_memory', side_effect=[
127132
mocker.MagicMock(total=67 * 1e9), mocker.MagicMock(used=30 * 1e9), mocker.MagicMock(used=31 * 1e9),
128133
mocker.MagicMock(used=29 * 1e9)])
129-
nvidia_smi_outputs = [
130-
b'',
131-
b'',
132-
b' uuid,memory.total [MiB]\ngpu-id1,12198 MiB\ngpu-id2,12198 MiB\ngpu-id3 , 12198MiB',
133-
b'pid, used_gpu_memory [MiB]\n',
134-
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 0 MiB, 0 %\ngpu-id2 , 0 MiB, 0 %\ngpu-id3 , 0 MiB, 0 %',
135-
b'pid, used_gpu_memory [MiB]\n12,1600 MiB\n21,700 MiB\n22,200 MiB',
136-
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1600 MiB,75 %\ngpu-id2,900 MiB , 50 %\n gpu-id3, 500 MiB, 25 %',
137-
b'pid, used_gpu_memory [MiB]\n12,1500 MiB\n21,2100 MiB\n22,2200 MiB',
138-
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1500 MiB, 55 %\n gpu-id2, 4300 MiB, 45%\ngpu-id3,700MiB,35%']
139-
check_output_mock = mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=nvidia_smi_outputs)
134+
if gpu_brand == 'nvidia':
135+
check_output_side_effect = [
136+
b'',
137+
b'',
138+
b' uuid,memory.total [MiB]\ngpu-id1,12198 MiB\ngpu-id2,12198 MiB\ngpu-id3 , 12198MiB',
139+
b'pid, used_gpu_memory [MiB]\n',
140+
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 0 MiB, 0 %\ngpu-id2 , 0 MiB, 0 %\ngpu-id3 , 0 MiB, 0 %',
141+
b'pid, used_gpu_memory [MiB]\n12,1600 MiB\n21,700 MiB\n22,200 MiB',
142+
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1600 MiB,75 %\ngpu-id2,900 MiB , 50 %\n gpu-id3, 500 MiB, 25 %',
143+
b'pid, used_gpu_memory [MiB]\n12,1500 MiB\n21,2100 MiB\n22,2200 MiB',
144+
b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1500 MiB, 55 %\n gpu-id2, 4300 MiB, 45%\ngpu-id3,700MiB,35%']
145+
else:
146+
check_output_side_effect = [
147+
FileNotFoundError,
148+
b'',
149+
b'gpu,size,extraneous-col\n0,12198,some-val\n1,12198,some-val\n2 , 12198,some-val',
150+
b'gpu,gpu_uuid,extraneous-col\n0,gpu-id1,some-val\n1,gpu-id2,some-val\n2,gpu-id3 ,some-val',
151+
b'pid,vram_mem\n',
152+
b'gpu,vram_used,gfx\n0,0,0\n1 ,0,0\n2 ,0,0',
153+
b'pid,vram_mem\n12,1600000000\n21,700000000\n22,200000000',
154+
b'gpu,vram_used,gfx\n0,1600,75\n1,900,50\n2,500,25',
155+
b'pid,vram_mem\n12,1500000000\n21,2100000000\n22,2200000000',
156+
b'gpu,vram_used,gfx\n0,1500,55\n1,4300,45\n2,700,35'
157+
]
158+
check_output_mock = mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=check_output_side_effect)
140159
cpu_count_mock = mocker.patch('gpu_tracker.tracker.psutil.cpu_count', return_value=4)
141160
cpu_percent_mock = mocker.patch(
142161
'gpu_tracker.tracker.psutil.cpu_percent', side_effect=[[67.5, 27.3, 77.8, 97.9], [57.6, 58.2, 23.5, 99.8], [78.3, 88.3, 87.2, 22.5]])
@@ -157,6 +176,7 @@ def start_mock(self):
157176
gpu_uuids=gpu_uuids, n_expected_cores=n_expected_cores)
158177
tracker.start()
159178
tracker.stop()
179+
gput.tracker._AMDQuerier._AMDQuerier__id_to_uuid = None
160180
assert start_mock.called
161181
assert not os.path.isfile(tracker._resource_usage_file)
162182
assert not log_spy.called
@@ -175,7 +195,7 @@ def start_mock(self):
175195
utils.assert_args_list(mock=main_process_mock.memory_info, expected_args_list=[()] * 3)
176196
utils.assert_args_list(mock=child1_mock.memory_info, expected_args_list=[()] * 3)
177197
utils.assert_args_list(mock=child2_mock.memory_info, expected_args_list=[()] * 3)
178-
assert len(check_output_mock.call_args_list) == 9
198+
# Parenthesize the conditional: without it the statement parses as
# `assert ((len(...) == 10) if gpu_brand == 'amd' else 9)`, which always passes for nvidia.
assert len(check_output_mock.call_args_list) == (10 if gpu_brand == 'amd' else 9)
179199
os_mock.getpid.assert_called_once_with()
180200
utils.assert_args_list(mock=time_mock.time, expected_args_list=[()] * 5)
181201
cpu_percent_interval = gput.tracker._TrackingProcess._CPU_PERCENT_INTERVAL
@@ -211,7 +231,7 @@ def side_effect_func(command, *_, **__) -> None:
211231
# The check_output mock is called 3 times before it's supposed to, causing a "RuntimeError: generator raised StopIteration".
212232
if command in ('nvidia-smi', 'amd-smi'):
213233
raise exceptions.pop()
214-
raise FileNotFoundError()
234+
raise FileNotFoundError() # pragma: nocover
215235
mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=side_effect_func)
216236
gput.Tracker()
217237
gput.Tracker()

0 commit comments

Comments
 (0)