55from decimal import Decimal , ROUND_HALF_UP
66import datetime
77
# NOTE(review): this span of the diff deletes the legacy GPU type constants,
# MIG geometry resource names, VM GPU resource names, the per-GPU SU type
# labels, and the ServiceUnit namedtuple (all shown as "-" lines in the
# scrape). Storage-only invoicing needs just the single SU label below.

# Service Unit (SU) type label used for persistent-storage billing.
SU_STORAGE = "OpenShift Storage"
399
@dataclass
class PVC:
    """An object that represents a PVC (persistent volume claim).

    Replaces the former Pod dataclass in this diff: the GPU/CPU/memory
    request fields and the get_service_unit() SU-classification method are
    removed; storage aggregation now happens in ProjectInvoce.add_pvc().
    """

    # Name of the underlying persistent volume — inferred from the field
    # name; confirm against the metrics producer.
    volume: str
    # Name of the persistent volume claim; emitted in generate_pvc_row().
    persistent_volume_claim: str
    namespace: str
    # Epoch seconds at which the claim's metered period starts.
    start_time: int
    # Metered duration in seconds; end of the period is start_time + duration.
    duration: int
    # Claim size in GiB; multiplied by runtime hours when aggregating
    # storage SU-hours (see ProjectInvoce.add_pvc).
    size_gib: Decimal
12219
# NOTE(review): only get_runtime's signature is visible — its body is hidden
# behind the "@@" hunk header below. It apparently returns a Decimal runtime
# in hours (callers quantize the result) with the ignore_times windows
# subtracted — confirm against the hidden body.
12320 def get_runtime (
12421 self , ignore_times : List [Tuple [datetime .datetime , datetime .datetime ]] = None
@@ -145,22 +42,18 @@ def get_runtime(
14542 def end_time (self ) -> int :
14643 return self .start_time + self .duration
14744
# NOTE(review): diff-scrape view — "-" lines are the removed
# generate_pod_row, "+" lines the added generate_pvc_row. The opening of the
# returned row list (and any preceding lines such as `self.namespace`) is
# hidden behind the "@@" hunk header below, so the method is not fully
# visible here.
148- def generate_pod_row (self , ignore_times , su_definitions ):
45+ def generate_pvc_row (self , ignore_times ):
14946 """
15047 This returns a row to represent PVC data.
15148 It converts the epoch_time stamps to datetime timestamps so it's more readable.
15249 Additionally, some metrics are rounded for readability.
15350 """
154- su_type , su_count , determining_resource = self .get_service_unit (su_definitions )
15551 start_time = datetime .datetime .fromtimestamp (
15652 self .start_time , datetime .UTC
15753 ).strftime ("%Y-%m-%dT%H:%M:%S" )
15854 end_time = datetime .datetime .fromtimestamp (
15955 self .end_time , datetime .UTC
16056 ).strftime ("%Y-%m-%dT%H:%M:%S" )
161- memory_request = self .memory_request .quantize (
162- Decimal (".0001" ), rounding = ROUND_HALF_UP
163- )
16457 runtime = self .get_runtime (ignore_times ).quantize (
16558 Decimal (".0001" ), rounding = ROUND_HALF_UP
16659 )
@@ -169,29 +62,10 @@ def generate_pod_row(self, ignore_times, su_definitions):
16962 start_time ,
17063 end_time ,
17164 runtime ,
172- self .pod_name ,
173- self .cpu_request ,
174- self .gpu_request ,
175- self .gpu_type ,
176- self .gpu_resource ,
177- self .node_hostname ,
178- self .node_model ,
179- memory_request ,
180- determining_resource ,
181- su_type ,
182- su_count ,
65+ self .persistent_volume_claim ,
66+ self .size_gib ,
18367 ]
18468
185-
# NOTE(review): the Rates dataclass below (per-GPU-type hourly rates) is
# deleted by this diff — billing now uses the single flat `rate: Decimal`
# field added on ProjectInvoce.
186- @dataclass ()
187- class Rates :
188- cpu : Decimal
189- gpu_a100 : Decimal
190- gpu_a100sxm4 : Decimal
191- gpu_v100 : Decimal
192- gpu_h100 : Decimal
193-
194-
# NOTE(review): class name "ProjectInvoce" and field "intitution" look like
# typos for "ProjectInvoice" / "institution", but both are part of the public
# interface — renaming would break external callers, so they are only flagged
# here. Several leading fields are hidden behind the "@@" hunk header below.
19569@dataclass
19670class ProjectInvoce :
19771 """Represents the invoicing data for a project."""
@@ -205,48 +79,25 @@ class ProjectInvoce:
20579 invoice_address : str
20680 intitution : str
20781 institution_specific_code : str
208- rates : Rates
209- su_definitions : dict
# Flat per-SU-hour rate; replaces the per-GPU-type Rates lookup removed by
# this diff.
82+ rate : Decimal
# Optional list of (start, end) datetime windows excluded from runtime.
21083 ignore_hours : Optional [List [Tuple [datetime .datetime , datetime .datetime ]]] = None
# Accumulator of billable SU-hours per SU type; storage-only after this diff.
21184 su_hours : dict = field (
21285 default_factory = lambda : {
213- SU_CPU : 0 ,
214- SU_A100_GPU : 0 ,
215- SU_A100_SXM4_GPU : 0 ,
216- SU_V100_GPU : 0 ,
217- SU_H100_GPU : 0 ,
218- SU_UNKNOWN_GPU : 0 ,
219- SU_UNKNOWN_MIG_GPU : 0 ,
220- SU_UNKNOWN : 0 ,
86+ SU_STORAGE : 0 ,
22187 }
22288 )
22389
224- def add_pod (self , pod : Pod ) -> None :
90+ def add_pvc (self , pvc : PVC ) -> None :
22591 """Aggregate a pods data"""
226- su_type , su_count , _ = pod .get_service_unit (self .su_definitions )
227- duration_in_hours = pod .get_runtime (self .ignore_hours )
228- self .su_hours [su_type ] += su_count * duration_in_hours
229-
230- def get_rate (self , su_type ) -> Decimal :
231- if su_type == SU_CPU :
232- return self .rates .cpu
233- if su_type == SU_A100_GPU :
234- return self .rates .gpu_a100
235- if su_type == SU_A100_SXM4_GPU :
236- return self .rates .gpu_a100sxm4
237- if su_type == SU_V100_GPU :
238- return self .rates .gpu_v100
239- if su_type == SU_H100_GPU :
240- return self .rates .gpu_h100
241- return Decimal (0 )
92+ duration_in_hours = pvc .get_runtime (self .ignore_hours )
93+ self .su_hours [SU_STORAGE ] += pvc .size_gib * duration_in_hours
24294
24395 def generate_invoice_rows (self , report_month ) -> List [str ]:
# NOTE(review): annotated "-> List[str]" but each appended element is itself
# a list of cell values, so the true return type is a list of rows — confirm
# against callers. The method's tail (including its return statement)
# continues past this view; some row fields are hidden behind the "@@" hunk
# header below.
24496 rows = []
24597 for su_type , hours in self .su_hours .items ():
24698 if hours > 0 :
# Partial SU-hours are rounded up to a whole hour before pricing.
24799 hours = math .ceil (hours )
248- rate = self .get_rate (su_type )
249- cost = (rate * hours ).quantize (Decimal (".01" ), rounding = ROUND_HALF_UP )
# Cost is billed at the flat project rate, rounded half-up to cents.
100+ cost = (self .rate * hours ).quantize (Decimal (".01" ), rounding = ROUND_HALF_UP )
250101 row = [
251102 report_month ,
252103 self .project ,
@@ -259,7 +110,7 @@ def generate_invoice_rows(self, report_month) -> List[str]:
259110 self .institution_specific_code ,
260111 hours ,
261112 su_type ,
262- rate ,
113+ self . rate ,
263114 cost ,
264115 ]
265116 rows .append (row )
0 commit comments