1
1
"""The ``tracker`` module contains the ``Tracker`` class which can alternatively be imported directly from the ``gpu_tracker`` package."""
2
2
from __future__ import annotations
3
+ import abc
3
4
import json
4
5
import dataclasses as dclass
5
6
import platform
17
18
import pandas as pd
18
19
19
20
21
+ class _GPUQuerier (abc .ABC ):
22
+ @classmethod
23
+ def _query_gpu (cls , * args ) -> pd .DataFrame :
24
+ output = subp .check_output ((cls .command ,) + args , stderr = subp .STDOUT ).decode ()
25
+ gpu_info = pd .read_csv (io .StringIO (output ))
26
+ return gpu_info .map (lambda value : value .strip () if type (value ) is str else value )
27
+
28
+ @classmethod
29
+ def is_available (cls ) -> bool | None :
30
+ try :
31
+ subp .check_output (cls .command )
32
+ return True
33
+ except subp .CalledProcessError :
34
+ return False
35
+ except FileNotFoundError :
36
+ return None
37
+
38
+ @classmethod
39
+ @abc .abstractmethod
40
+ def static_info (cls ) -> pd .DataFrame :
41
+ pass # pragma: nocover
42
+
43
+ @classmethod
44
+ @abc .abstractmethod
45
+ def process_ram (cls ) -> pd .DataFrame :
46
+ pass # pragma: nocover
47
+
48
+ @classmethod
49
+ @abc .abstractmethod
50
+ def ram_and_utilization (cls ) -> pd .DataFrame :
51
+ pass # pragma: nocover
52
+
53
class _NvidiaQuerier(_GPUQuerier):
    """Queries GPU statistics for Nvidia GPUs via the nvidia-smi command."""
    command = 'nvidia-smi'

    @classmethod
    def _query_gpu(cls, *args: str, ram_column: str | None = None) -> pd.DataFrame:
        """Run nvidia-smi in CSV mode and normalize its output.

        :param args: nvidia-smi query arguments, e.g. '--query-gpu=...'.
        :param ram_column: Name of the column containing memory amounts of the form '<n> MiB'; its values are
            converted to integer MiB counts and the column is renamed to 'ram'.
        :return: The normalized query results.
        """
        gpu_info = super()._query_gpu(*args, '--format=csv')
        # nvidia-smi embeds units in the headers, e.g. 'memory.total [MiB]'; strip them off.
        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
        gpu_info[ram_column] = gpu_info[ram_column].apply(lambda ram: int(ram.replace('MiB', '').strip()))
        return gpu_info.rename(columns={ram_column: 'ram'})

    @classmethod
    def static_info(cls) -> pd.DataFrame:
        """Return each GPU's UUID and total RAM ('uuid' and 'ram' columns, RAM in MiB)."""
        return cls._query_gpu('--query-gpu=uuid,memory.total', ram_column='memory.total')

    @classmethod
    def process_ram(cls) -> pd.DataFrame:
        """Return GPU RAM usage per compute process ('pid' and 'ram' columns, RAM in MiB)."""
        return cls._query_gpu('--query-compute-apps=pid,used_gpu_memory', ram_column='used_gpu_memory')

    @classmethod
    def ram_and_utilization(cls) -> pd.DataFrame:
        """Return each GPU's current RAM usage and utilization ('uuid', 'ram', and 'utilization_percent' columns)."""
        gpu_info = cls._query_gpu('--query-gpu=uuid,memory.used,utilization.gpu', ram_column='memory.used')
        gpu_info = gpu_info.rename(columns={'utilization.gpu': 'utilization_percent'})
        # Utilization is reported as percentage strings, e.g. '42 %'; convert to floats.
        gpu_info.utilization_percent = [
            float(percentage.replace('%', '').strip()) for percentage in gpu_info.utilization_percent]
        return gpu_info
78
class _AMDQuerier(_GPUQuerier):
    """Queries GPU statistics for AMD GPUs via the amd-smi command."""
    # CLI executable used by _GPUQuerier._query_gpu and _GPUQuerier.is_available.
    command = 'amd-smi'
    # NOTE(review): implementations of the abstract methods static_info, process_ram, and
    # ram_and_utilization are not visible in this view — confirm they are defined elsewhere;
    # otherwise calling them on this class would return None from the abstract stubs.
+
20
81
class _TrackingProcess (mproc .Process ):
21
82
_CPU_PERCENT_INTERVAL = 0.1
22
83
_ram_unit2coefficient = {
@@ -43,7 +104,7 @@ class _TrackingProcess(mproc.Process):
43
104
def __init__ (
44
105
self , stop_event : mproc .Event , sleep_time : float , ram_unit : str , gpu_ram_unit : str , time_unit : str ,
45
106
n_expected_cores : int | None , gpu_uuids : set [str ] | None , disable_logs : bool , main_process_id : int ,
46
- resource_usage_file : str , extraneous_process_ids : set [int ]):
107
+ resource_usage_file : str , extraneous_process_ids : set [int ], gpu_brand : str | None ):
47
108
super ().__init__ ()
48
109
self ._stop_event = stop_event
49
110
if sleep_time < _TrackingProcess ._CPU_PERCENT_INTERVAL :
@@ -63,24 +124,45 @@ def __init__(
63
124
self ._hardware_percent_sums = {key : 0. for key in percent_keys }
64
125
self ._tracking_iteration = 1
65
126
self ._is_linux = platform .system ().lower () == 'linux'
66
- self ._nvidia_available = True
67
- try :
68
- subp .check_output ('nvidia-smi' )
69
- except FileNotFoundError :
70
- self ._nvidia_available = False
71
- self ._log_warning (
72
- 'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. '
73
- 'Otherwise the Max GPU RAM values will remain 0.0' )
127
+ cannot_connect_warning = ('The {} command is installed but cannot connect to a GPU. '
128
+ 'The GPU RAM and GPU utilization values will remain 0.0.' )
129
+ if gpu_brand is None :
130
+ nvidia_available = _NvidiaQuerier .is_available ()
131
+ nvidia_installed = nvidia_available is not None
132
+ nvidia_available = bool (nvidia_available )
133
+ amd_available = _AMDQuerier .is_available ()
134
+ amd_installed = amd_available is not None
135
+ amd_available = bool (amd_available )
136
+ if nvidia_available :
137
+ gpu_brand = 'nvidia'
138
+ elif amd_available :
139
+ gpu_brand = 'amd'
140
+ elif nvidia_installed :
141
+ self ._log_warning (cannot_connect_warning .format ('nvidia-smi' ))
142
+ elif amd_installed :
143
+ self ._log_warning (cannot_connect_warning .format ('amd-smi' ))
144
+ else :
145
+ self ._log_warning (
146
+ 'Neither the nvidia-smi command nor the amd-smi command is installed. Install one of these to profile the GPU. '
147
+ 'Otherwise the GPU RAM and GPU utilization values will remain 0.0.' )
148
+ if gpu_brand == 'nvidia' :
149
+ self ._gpu_querier = _NvidiaQuerier
150
+ elif gpu_brand == 'amd' :
151
+ self ._gpu_querier = _AMDQuerier
152
+ elif gpu_brand is None :
153
+ self ._gpu_querier = None
154
+ else :
155
+ raise ValueError (f'"{ gpu_brand } " is not a valid GPU brand. Supported values are "nvidia" and "amd".' )
74
156
max_ram = MaxRAM (unit = ram_unit , system_capacity = psutil .virtual_memory ().total * self ._ram_coefficient )
75
157
system_core_count = psutil .cpu_count ()
76
158
cpu_utilization = CPUUtilization (
77
159
system_core_count = system_core_count ,
78
160
n_expected_cores = n_expected_cores if n_expected_cores is not None else system_core_count )
79
- if self ._nvidia_available :
80
- gpu_info = _TrackingProcess . _query_gpu ( nvidia_command = '--query-gpu=uuid,memory.total' )
81
- gpu_ram_system_capacity = self ._get_gpu_ram (gpu_info = gpu_info , column = 'memory.total' )
161
+ if self ._gpu_querier :
162
+ gpu_info = self . _gpu_querier . static_info ( )
163
+ gpu_ram_system_capacity = self ._get_gpu_ram (gpu_info = gpu_info )
82
164
max_gpu_ram = MaxGPURAM (unit = gpu_ram_unit , system_capacity = gpu_ram_system_capacity )
83
- all_uuids = set (gpu_info [ ' uuid' ] )
165
+ all_uuids = set (gpu_info . uuid )
84
166
if gpu_uuids is None :
85
167
self ._gpu_uuids = all_uuids
86
168
else :
@@ -143,25 +225,23 @@ def run(self):
143
225
self ._resource_usage .max_ram .system = max (
144
226
self ._resource_usage .max_ram .system , psutil .virtual_memory ().used * self ._ram_coefficient )
145
227
# Get the maximum GPU RAM usage if available.
146
- if self ._nvidia_available : # pragma: nocover
147
- gpu_info = _TrackingProcess . _query_gpu ( nvidia_command = '--query-compute-apps=pid,used_gpu_memory' )
228
+ if self ._gpu_querier : # pragma: nocover
229
+ gpu_info = self . _gpu_querier . process_ram ( )
148
230
if len (gpu_info ):
149
231
process_ids = {self ._main_process_id }
150
232
self ._update_gpu_ram (attr = 'main' , process_ids = process_ids , gpu_info = gpu_info )
151
233
process_ids = set (self ._map_processes (processes = descendant_processes , map_func = lambda process : process .pid ))
152
234
self ._update_gpu_ram (attr = 'descendants' , process_ids = process_ids , gpu_info = gpu_info )
153
235
process_ids .add (self ._main_process_id )
154
236
self ._update_gpu_ram (attr = 'combined' , process_ids = process_ids , gpu_info = gpu_info )
155
- gpu_info = _TrackingProcess . _query_gpu ( nvidia_command = '--query-gpu=uuid,memory.used,utilization.gpu' )
156
- system_gpu_ram = self ._get_gpu_ram (gpu_info , column = 'memory.used' )
237
+ gpu_info = self . _gpu_querier . ram_and_utilization ( )
238
+ system_gpu_ram = self ._get_gpu_ram (gpu_info )
157
239
self ._resource_usage .max_gpu_ram .system = max (self ._resource_usage .max_gpu_ram .system , system_gpu_ram )
158
- gpu_info = gpu_info .loc [gpu_info ['uuid' ].apply (lambda gpu_uuid : gpu_uuid in self ._gpu_uuids )]
159
- gpu_percentages = [float (percentage .replace ('%' , '' ).strip ()) for percentage in gpu_info ['utilization.gpu' ]]
240
+ gpu_info = gpu_info .loc [[uuid in self ._gpu_uuids for uuid in gpu_info .uuid ]]
160
241
self ._update_processing_unit_utilization (
161
- current_percentages = gpu_percentages ,
242
+ current_percentages = list ( gpu_info . utilization_percent ) ,
162
243
processing_unit_percentages = self ._resource_usage .gpu_utilization .gpu_percentages , percent_key = 'gpu' ,
163
244
n_hardware_units = self ._resource_usage .gpu_utilization .n_expected_gpus )
164
-
165
245
# Get the mean and maximum CPU usages.
166
246
main_n_threads = self ._map_processes ([main_process ], map_func = get_n_threads )
167
247
descendant_n_threads = self ._map_processes (descendant_processes , map_func = get_n_threads )
@@ -230,23 +310,13 @@ def _update_ram(self, rss_values: RSSValues, memory_maps_list: list[list] | None
230
310
rss_values .total_rss = max (rss_values .total_rss , total_rss )
231
311
232
312
def _update_gpu_ram(self, attr: str, process_ids: set[int], gpu_info: pd.DataFrame):
    """Update the running maximum GPU RAM for the given attribute of max_gpu_ram.

    :param attr: Which max_gpu_ram field to update ('main', 'descendants', or 'combined').
    :param process_ids: The process IDs whose GPU RAM usage is summed.
    :param gpu_info: Per-process GPU RAM info with 'pid' and 'ram' columns.
    """
    # Keep only the rows belonging to the processes of interest.
    mask = [pid in process_ids for pid in gpu_info.pid]
    relevant_info = gpu_info.loc[mask]
    gpu_ram = self._get_gpu_ram(relevant_info)
    previous_max = getattr(self._resource_usage.max_gpu_ram, attr)
    setattr(self._resource_usage.max_gpu_ram, attr, max(previous_max, gpu_ram))
237
317
238
- @staticmethod
239
- def _query_gpu (nvidia_command : str ) -> pd .DataFrame :
240
- command = f'nvidia-smi { nvidia_command } --format=csv'
241
- output = subp .check_output (command .split (), stderr = subp .STDOUT ).decode ()
242
- gpu_info = pd .read_csv (io .StringIO (output ))
243
- gpu_info .columns = [col .replace ('[MiB]' , '' ).replace ('[%]' , '' ).strip () for col in gpu_info .columns ]
244
- return gpu_info .map (lambda value : value .strip () if type (value ) is str else value )
245
-
246
- def _get_gpu_ram (self , gpu_info : pd .DataFrame , column : str ) -> float :
247
- gpu_rams = gpu_info [column ]
248
- gpu_rams = gpu_rams .apply (lambda ram : int (ram .replace ('MiB' , '' ).strip ()))
249
- return sum (gpu_rams ) * self ._gpu_ram_coefficient
318
+ def _get_gpu_ram (self , gpu_info : pd .DataFrame ) -> float :
319
+ return sum (gpu_info .ram ) * self ._gpu_ram_coefficient
250
320
251
321
def _update_processing_unit_utilization (
252
322
self , current_percentages : list [float ], processing_unit_percentages : ProcessingUnitPercentages ,
@@ -297,7 +367,8 @@ class State(enum.Enum):
297
367
def __init__ (
298
368
self , sleep_time : float = 1.0 , ram_unit : str = 'gigabytes' , gpu_ram_unit : str = 'gigabytes' , time_unit : str = 'hours' ,
299
369
n_expected_cores : int = None , gpu_uuids : set [str ] = None , disable_logs : bool = False , process_id : int = None ,
300
- resource_usage_file : str | None = None , n_join_attempts : int = 5 , join_timeout : float = 10.0 ):
370
+ resource_usage_file : str | None = None , n_join_attempts : int = 5 , join_timeout : float = 10.0 ,
371
+ gpu_brand : str | None = None ):
301
372
"""
302
373
:param sleep_time: The number of seconds to sleep in between usage-collection iterations.
303
374
:param ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.
@@ -310,6 +381,7 @@ def __init__(
310
381
:param resource_usage_file: The file path to the pickle file containing the ``resource_usage`` attribute. This file is automatically deleted and the ``resource_usage`` attribute is set in memory if the tracking successfully completes. But if the tracking is interrupted, the tracking information will be saved in this file as a backup. Defaults to a randomly generated file name in the current working directory of the format ``.gpu-tracker_<random UUID>.pkl``.
311
382
:param n_join_attempts: The number of times the tracker attempts to join its underlying sub-process.
312
383
:param join_timeout: The amount of time the tracker waits for its underlying sub-process to join.
384
+ :param gpu_brand: The brand of GPU to profile. Valid values are "nvidia" and "amd". Defaults to the brand of GPU detected in the system, checking Nvidia first.
313
385
:raises ValueError: Raised if invalid units are provided.
314
386
"""
315
387
current_process_id = os .getpid ()
@@ -323,7 +395,7 @@ def __init__(
323
395
self ._resource_usage_file = f'.gpu-tracker_{ uuid .uuid1 ()} .pkl' if resource_usage_file is None else resource_usage_file
324
396
self ._tracking_process = _TrackingProcess (
325
397
self ._stop_event , sleep_time , ram_unit , gpu_ram_unit , time_unit , n_expected_cores , gpu_uuids , disable_logs ,
326
- process_id if process_id is not None else current_process_id , self ._resource_usage_file , extraneous_ids )
398
+ process_id if process_id is not None else current_process_id , self ._resource_usage_file , extraneous_ids , gpu_brand )
327
399
self .resource_usage = None
328
400
self .n_join_attempts = n_join_attempts
329
401
self .join_timeout = join_timeout
0 commit comments