
Commit 8c862a7

WIP
1 parent 358a01b commit 8c862a7

File tree

7 files changed: +118 -443 lines changed


gpu_node_map.json

Lines changed: 0 additions & 20 deletions
This file was deleted.

openshift_metrics/invoice.py

Lines changed: 16 additions & 165 deletions
@@ -5,120 +5,17 @@
 from decimal import Decimal, ROUND_HALF_UP
 import datetime

-# GPU types
-GPU_A100 = "NVIDIA-A100-40GB"
-GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
-GPU_V100 = "Tesla-V100-PCIE-32GB"
-GPU_H100 = "NVIDIA-H100-80GB-HBM3"
-GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
-
-# GPU Resource - MIG Geometries
-# A100 Strategies
-MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
-MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
-MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
-WHOLE_GPU = "nvidia.com/gpu"
-
-# VM GPU Resources
-VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
-VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
-VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
-
-# SU Types
-SU_CPU = "OpenShift CPU"
-SU_A100_GPU = "OpenShift GPUA100"
-SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
-SU_V100_GPU = "OpenShift GPUV100"
-SU_H100_GPU = "OpenShift GPUH100"
-SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
-SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
-SU_UNKNOWN = "Openshift Unknown"
-
-ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])
-
+SU_STORAGE = "OpenShift Storage"

 @dataclass
-class Pod:
-    """Object that represents a pod"""
-
-    pod_name: str
+class PVC:
+    """an object that represents a pvc"""
+    volume: str
+    persistent_volume_claim: str
     namespace: str
     start_time: int
     duration: int
-    cpu_request: Decimal
-    gpu_request: Decimal
-    memory_request: Decimal
-    gpu_type: str
-    gpu_resource: str
-    node_hostname: str
-    node_model: str
-
-    def get_service_unit(self, su_definitions) -> ServiceUnit:
-        """
-        Returns the type of service unit, the count, and the determining resource
-        """
-        su_type = SU_UNKNOWN
-        su_count = 0
-
-        # pods that requested a specific GPU but weren't scheduled may report 0 GPU
-        if self.gpu_resource is not None and self.gpu_request == 0:
-            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")
-
-        # pods in weird states
-        if self.cpu_request == 0 or self.memory_request == 0:
-            return ServiceUnit(SU_UNKNOWN, 0, "CPU")
-
-        known_gpu_su = {
-            GPU_A100: SU_A100_GPU,
-            GPU_A100_SXM4: SU_A100_SXM4_GPU,
-            GPU_V100: SU_V100_GPU,
-            GPU_H100: SU_H100_GPU,
-        }
-
-        A100_SXM4_MIG = {
-            MIG_1G_5GB: SU_UNKNOWN_MIG_GPU,
-            MIG_2G_10GB: SU_UNKNOWN_MIG_GPU,
-            MIG_3G_20GB: SU_UNKNOWN_MIG_GPU,
-        }
-
-        if self.gpu_resource is None and self.gpu_request == 0:
-            su_type = SU_CPU
-        elif self.gpu_type is not None and self.gpu_resource == WHOLE_GPU:
-            su_type = known_gpu_su.get(self.gpu_type, SU_UNKNOWN_GPU)
-        elif self.gpu_resource == VM_GPU_A100_SXM4:
-            su_type = SU_A100_SXM4_GPU
-        elif self.gpu_resource == VM_GPU_H100:
-            su_type = SU_H100_GPU
-        elif self.gpu_resource == VM_GPU_V100:
-            su_type = SU_V100_GPU
-        elif self.gpu_type == GPU_A100_SXM4:  # for MIG GPU of type A100_SXM4
-            su_type = A100_SXM4_MIG.get(self.gpu_resource, SU_UNKNOWN_MIG_GPU)
-        else:
-            return ServiceUnit(SU_UNKNOWN_GPU, 0, "GPU")
-
-        cpu_multiplier = self.cpu_request / int(su_definitions[su_type]["vCPUs"])
-        memory_multiplier = self.memory_request / int(
-            (int(su_definitions[su_type]["RAM"]) / 1024)
-        )
-        if int(su_definitions[su_type]["GPUs"]) != 0:
-            gpu_multiplier = self.gpu_request / int(su_definitions[su_type]["GPUs"])
-        else:
-            gpu_multiplier = 0
-
-        su_count = max(cpu_multiplier, gpu_multiplier, memory_multiplier)
-
-        # no fractional SUs for GPU SUs
-        if su_type != SU_CPU:
-            su_count = math.ceil(su_count)
-
-        if gpu_multiplier >= cpu_multiplier and gpu_multiplier >= memory_multiplier:
-            determining_resource = "GPU"
-        elif cpu_multiplier >= gpu_multiplier and cpu_multiplier >= memory_multiplier:
-            determining_resource = "CPU"
-        else:
-            determining_resource = "RAM"
-
-        return ServiceUnit(su_type, su_count, determining_resource)
+    size_gib: Decimal

     def get_runtime(
         self, ignore_times: List[Tuple[datetime.datetime, datetime.datetime]] = None

@@ -145,22 +42,18 @@ def get_runtime(
     def end_time(self) -> int:
         return self.start_time + self.duration

-    def generate_pod_row(self, ignore_times, su_definitions):
+    def generate_pvc_row(self, ignore_times):
         """
         This returns a row to represent pod data.
         It converts the epoch_time stamps to datetime timestamps so it's more readable.
         Additionally, some metrics are rounded for readibility.
         """
-        su_type, su_count, determining_resource = self.get_service_unit(su_definitions)
         start_time = datetime.datetime.fromtimestamp(
             self.start_time, datetime.UTC
         ).strftime("%Y-%m-%dT%H:%M:%S")
         end_time = datetime.datetime.fromtimestamp(
             self.end_time, datetime.UTC
         ).strftime("%Y-%m-%dT%H:%M:%S")
-        memory_request = self.memory_request.quantize(
-            Decimal(".0001"), rounding=ROUND_HALF_UP
-        )
         runtime = self.get_runtime(ignore_times).quantize(
             Decimal(".0001"), rounding=ROUND_HALF_UP
         )

@@ -169,29 +62,10 @@ def generate_pod_row(self, ignore_times, su_definitions):
             start_time,
             end_time,
             runtime,
-            self.pod_name,
-            self.cpu_request,
-            self.gpu_request,
-            self.gpu_type,
-            self.gpu_resource,
-            self.node_hostname,
-            self.node_model,
-            memory_request,
-            determining_resource,
-            su_type,
-            su_count,
+            self.persistent_volume_claim,
+            self.size_gib,
         ]

-
-@dataclass()
-class Rates:
-    cpu: Decimal
-    gpu_a100: Decimal
-    gpu_a100sxm4: Decimal
-    gpu_v100: Decimal
-    gpu_h100: Decimal
-
-
 @dataclass
 class ProjectInvoce:
     """Represents the invoicing data for a project."""

@@ -205,48 +79,25 @@ class ProjectInvoce:
     invoice_address: str
     intitution: str
     institution_specific_code: str
-    rates: Rates
-    su_definitions: dict
+    rate: Decimal
     ignore_hours: Optional[List[Tuple[datetime.datetime, datetime.datetime]]] = None
     su_hours: dict = field(
         default_factory=lambda: {
-            SU_CPU: 0,
-            SU_A100_GPU: 0,
-            SU_A100_SXM4_GPU: 0,
-            SU_V100_GPU: 0,
-            SU_H100_GPU: 0,
-            SU_UNKNOWN_GPU: 0,
-            SU_UNKNOWN_MIG_GPU: 0,
-            SU_UNKNOWN: 0,
+            SU_STORAGE: 0,
         }
     )

-    def add_pod(self, pod: Pod) -> None:
+    def add_pvc(self, pvc: PVC) -> None:
         """Aggregate a pods data"""
-        su_type, su_count, _ = pod.get_service_unit(self.su_definitions)
-        duration_in_hours = pod.get_runtime(self.ignore_hours)
-        self.su_hours[su_type] += su_count * duration_in_hours
-
-    def get_rate(self, su_type) -> Decimal:
-        if su_type == SU_CPU:
-            return self.rates.cpu
-        if su_type == SU_A100_GPU:
-            return self.rates.gpu_a100
-        if su_type == SU_A100_SXM4_GPU:
-            return self.rates.gpu_a100sxm4
-        if su_type == SU_V100_GPU:
-            return self.rates.gpu_v100
-        if su_type == SU_H100_GPU:
-            return self.rates.gpu_h100
-        return Decimal(0)
+        duration_in_hours = pvc.get_runtime(self.ignore_hours)
+        self.su_hours[SU_STORAGE] += pvc.size_gib * duration_in_hours

     def generate_invoice_rows(self, report_month) -> List[str]:
         rows = []
         for su_type, hours in self.su_hours.items():
             if hours > 0:
                 hours = math.ceil(hours)
-                rate = self.get_rate(su_type)
-                cost = (rate * hours).quantize(Decimal(".01"), rounding=ROUND_HALF_UP)
+                cost = (self.rate * hours).quantize(Decimal(".01"), rounding=ROUND_HALF_UP)
                 row = [
                     report_month,
                     self.project,

@@ -259,7 +110,7 @@ def generate_invoice_rows(self, report_month) -> List[str]:
                     self.institution_specific_code,
                     hours,
                     su_type,
-                    rate,
+                    self.rate,
                     cost,
                 ]
                 rows.append(row)
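
In effect, this WIP change drops the per-GPU service-unit logic and bills storage only: add_pvc accumulates size_gib * runtime hours under the single SU_STORAGE bucket, and generate_invoice_rows rounds those hours up and prices them at one flat rate. A minimal standalone sketch of that arithmetic, with made-up example values (not taken from the commit):

import math
from decimal import Decimal, ROUND_HALF_UP

# Hypothetical inputs mirroring the new fields in the diff above.
size_gib = Decimal("100")            # PVC.size_gib
duration_in_hours = Decimal("47.5")  # result of PVC.get_runtime(...)
rate = Decimal("0.009")              # ProjectInvoce.rate, assumed per GiB-hour

# add_pvc: accumulate GiB-hours under SU_STORAGE
su_hours = size_gib * duration_in_hours          # 4750.0 GiB-hours

# generate_invoice_rows: round hours up, then price at the flat rate
hours = math.ceil(su_hours)                      # 4750
cost = (rate * hours).quantize(Decimal(".01"), rounding=ROUND_HALF_UP)
print(hours, cost)                               # 4750 42.75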
