2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yaml
@@ -22,4 +22,4 @@ jobs:

      - name: Run unit tests
        run: |
-          python -m unittest openshift_metrics/tests/test_*
+          pytest openshift_metrics/tests/
181 changes: 105 additions & 76 deletions openshift_metrics/merge.py
@@ -6,8 +6,9 @@
import logging
import argparse
from datetime import datetime, UTC
+from dataclasses import dataclass
import json
-from typing import Tuple
+from typing import Tuple, List
from decimal import Decimal
from nerc_rates import rates, outages

@@ -19,7 +20,15 @@
logger = logging.getLogger(__name__)


def compare_dates(date_str1, date_str2):
@dataclass
class MetricsMetadata:
cluster_name: str
report_start_date: str | None
report_end_date: str | None
interval_minutes: int


def compare_dates(date_str1, date_str2) -> bool:
"""Returns true is date1 is earlier than date2"""
date1 = datetime.strptime(date_str1, "%Y-%m-%d")
date2 = datetime.strptime(date_str2, "%Y-%m-%d")
@@ -67,21 +76,96 @@ def get_su_definitions(report_month) -> dict:
    return su_definitions


def load_and_merge_metrics(interval_minutes, files: List[str]) -> MetricsProcessor:
    """Load and merge metrics

    Loads metrics from provided json files and then returns a processor
    that has all the merged data.
    """
    processor = MetricsProcessor(interval_minutes)
    for file in files:
        with open(file, "r") as jsonfile:
            metrics_from_file = json.load(jsonfile)
            cpu_request_metrics = metrics_from_file["cpu_metrics"]
            memory_request_metrics = metrics_from_file["memory_metrics"]
            gpu_request_metrics = metrics_from_file.get("gpu_metrics", None)
            processor.merge_metrics("cpu_request", cpu_request_metrics)
            processor.merge_metrics("memory_request", memory_request_metrics)
            if gpu_request_metrics is not None:
                processor.merge_metrics("gpu_request", gpu_request_metrics)
Comment on lines +87 to +95

Collaborator:

Minor suggestion, but I think this is more concise:

Suggested change:
            with open(file, "r") as jsonfile:
                metrics_from_file = json.load(jsonfile)
-                cpu_request_metrics = metrics_from_file["cpu_metrics"]
-                memory_request_metrics = metrics_from_file["memory_metrics"]
-                gpu_request_metrics = metrics_from_file.get("gpu_metrics", None)
-                processor.merge_metrics("cpu_request", cpu_request_metrics)
-                processor.merge_metrics("memory_request", memory_request_metrics)
-                if gpu_request_metrics is not None:
-                    processor.merge_metrics("gpu_request", gpu_request_metrics)
+                for resource in ["cpu_metrics", "memory_metrics", "gpu_metrics"]:
+                    if resource == "gpu_metrics":
+                        if gpu_request_metrics := metrics_from_file.get(resource):
+                            processor.merge_metrics(resource, gpu_request_metrics)
+                    else:
+                        request_metrics = metrics_from_file[resource]
+                        processor.merge_metrics(resource, request_metrics)

If cpu_metrics and memory_metrics are always present, is it fine to make the loop even simpler?

for resource in ["cpu_metrics", "memory_metrics", "gpu_metrics"]:
    if request_metrics := metrics_from_file.get(resource):
        processor.merge_metrics(resource, request_metrics)

Collaborator (Author):

good suggestion

Collaborator (Author):

Uh, while this is a good suggestion, due to the old clumsy naming of things it'll break stuff. See, the files put things in cpu_metrics but the processors call it cpu_request, so our neat little loop won't work. I am going to leave this as is.

Collaborator:

Ah sorry, the strings looked similar and I thought they were the same.

    return processor
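For reference, a key-mapping dict would let a single loop survive that naming mismatch. This is a hypothetical sketch that slots into the per-file loop above, not something this PR adopts:

            # Hypothetical: translate the file's legacy keys to the processor's
            # metric names so one loop can replace the per-resource statements.
            FILE_KEY_TO_METRIC_NAME = {
                "cpu_metrics": "cpu_request",
                "memory_metrics": "memory_request",
                "gpu_metrics": "gpu_request",
            }

            for file_key, metric_name in FILE_KEY_TO_METRIC_NAME.items():
                if request_metrics := metrics_from_file.get(file_key):
                    processor.merge_metrics(metric_name, request_metrics)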


def load_metadata(files: List[str]) -> MetricsMetadata:
Collaborator (Author):

I think I could load data and metadata in a single loop instead of loading files twice. And for that reason I don't like what I've done. I am going to refactor it again later.

"""
Load only the metadata from the metrics files.
"""
cluster_name = None
report_start_date = None
report_end_date = None
interval_minutes = None

for file in files:
with open(file, "r") as jsonfile:
metrics_from_file = json.load(jsonfile)
if cluster_name is None:
cluster_name = metrics_from_file.get("cluster_name")
Comment on lines +111 to +112

Collaborator:

Is there any concern that cluster_name is not being checked? What if the provided files are from different clusters? It seems this behavior has been in the code prior to this refactoring, but wanted to ask just in case.

Collaborator (Author):

I don't think we checked that cluster_name could be different in different files, so this behavior is unchanged. But it doesn't hurt to add that additional check. I'll add that in a different PR.
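A minimal version of that check, assuming the same exit-on-mismatch behavior already used for intervals, could sit inside the per-file loop; cluster_name_from_file is an illustrative name:

            # Hypothetical sketch: refuse to mix files from different
            # clusters, mirroring the interval_minutes mismatch handling.
            cluster_name_from_file = metrics_from_file.get("cluster_name")
            if cluster_name is None:
                cluster_name = cluster_name_from_file
            elif cluster_name_from_file and cluster_name_from_file != cluster_name:
                sys.exit(
                    f"Cannot process files from different clusters "
                    f"{cluster_name} != {cluster_name_from_file}"
                )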


            if interval_minutes is None:
                interval_minutes = metrics_from_file.get("interval_minutes")
            else:
                interval_minutes_from_file = metrics_from_file["interval_minutes"]
                if interval_minutes != interval_minutes_from_file:
                    sys.exit(
                        f"Cannot process files with different intervals {interval_minutes} != {interval_minutes_from_file}"
                    )

            if report_start_date is None:
                report_start_date = metrics_from_file["start_date"]
            elif compare_dates(metrics_from_file["start_date"], report_start_date):
                report_start_date = metrics_from_file["start_date"]

            if report_end_date is None:
                report_end_date = metrics_from_file["end_date"]
            elif compare_dates(report_end_date, metrics_from_file["end_date"]):
                report_end_date = metrics_from_file["end_date"]

    if cluster_name is None:
        cluster_name = "Unknown Cluster"

    if interval_minutes is None:
        logger.info(
            f"No prometheus query interval minutes found in the given set of files. Using the provided interval: {PROM_QUERY_INTERVAL_MINUTES} minute(s)"
        )
        interval_minutes = PROM_QUERY_INTERVAL_MINUTES
    else:
        logger.info(
            f"Prometheus Query interval set to {interval_minutes} minute(s) from file"
        )

    return MetricsMetadata(
        cluster_name=cluster_name,
        report_start_date=report_start_date,
        report_end_date=report_end_date,
        interval_minutes=interval_minutes,
    )
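The single-loop refactor mentioned in the thread above could parse each file once and derive both pieces from the parsed payloads. A rough sketch, assuming it is acceptable to hold all payloads in memory and reusing this module's existing imports; load_all is an illustrative name:

def load_all(files: List[str]) -> Tuple[MetricsProcessor, MetricsMetadata]:
    # Hypothetical single-pass sketch: parse every file once, derive the
    # metadata from the parsed payloads, then merge metrics into a processor.
    payloads = []
    for file in files:
        with open(file, "r") as jsonfile:
            payloads.append(json.load(jsonfile))

    intervals = {p["interval_minutes"] for p in payloads if p.get("interval_minutes")}
    if len(intervals) > 1:
        sys.exit(f"Cannot process files with different intervals {intervals}")

    metadata = MetricsMetadata(
        cluster_name=next(
            (p["cluster_name"] for p in payloads if p.get("cluster_name")),
            "Unknown Cluster",
        ),
        # "%Y-%m-%d" strings sort chronologically, so plain min/max suffice.
        report_start_date=min(p["start_date"] for p in payloads),
        report_end_date=max(p["end_date"] for p in payloads),
        interval_minutes=intervals.pop() if intervals else PROM_QUERY_INTERVAL_MINUTES,
    )

    processor = MetricsProcessor(metadata.interval_minutes)
    for metrics_from_file in payloads:
        processor.merge_metrics("cpu_request", metrics_from_file["cpu_metrics"])
        processor.merge_metrics("memory_request", metrics_from_file["memory_metrics"])
        if gpu_request_metrics := metrics_from_file.get("gpu_metrics"):
            processor.merge_metrics("gpu_request", gpu_request_metrics)
    return processor, metadata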


def main():
    """Reads the metrics from files and generates the reports"""
    parser = argparse.ArgumentParser()
    parser.add_argument("files", nargs="+")
    parser.add_argument(
        "--invoice-file",
-        help="Name of the invoice file. Defaults to NERC OpenShift <report_month>.csv",
+        help="Name of the invoice file. Defaults to <cluster-name> <report_month>.csv",
    )
    parser.add_argument(
        "--pod-report-file",
-        help="Name of the pod report file. Defaults to Pod NERC OpenShift <report_month>.csv",
+        help="Name of the pod report file. Defaults to Pod-<cluster-name> <report_month>.csv",
    )
    parser.add_argument(
        "--class-invoice-file",
-        help="Name of the class report file. Defaults to NERC OpenShift Class <report_month>.csv",
+        help="Name of the class report file. Defaults to Classes-<cluster-name> <report_month>.csv",
    )
    parser.add_argument("--upload-to-s3", action="store_true")
    parser.add_argument(
@@ -104,60 +188,14 @@ def main():
    args = parser.parse_args()
    files = args.files

-    report_start_date = None
-    report_end_date = None
-    cluster_name = None
-    interval_minutes = None
-
-    for file in files:
-        with open(file, "r") as jsonfile:
-            metrics_from_file = json.load(jsonfile)
-            if interval_minutes is None:
-                interval_minutes = metrics_from_file.get("interval_minutes")
-            else:
-                interval_minutes_from_file = metrics_from_file["interval_minutes"]
-                if interval_minutes != interval_minutes_from_file:
-                    sys.exit(
-                        f"Cannot process files with different intervals {interval_minutes} != {interval_minutes_from_file}"
-                    )
-
-    if interval_minutes is None:
-        logger.info(
-            f"No prometheus query interval minutes found in the given set of files. Using the provided interval: {PROM_QUERY_INTERVAL_MINUTES} minute(s)"
-        )
-        interval_minutes = PROM_QUERY_INTERVAL_MINUTES
-    else:
-        logger.info(
-            f"Prometheus Query interval set to {interval_minutes} minute(s) from file"
-        )
-
-    processor = MetricsProcessor(interval_minutes)
-
-    for file in files:
-        with open(file, "r") as jsonfile:
-            metrics_from_file = json.load(jsonfile)
-            if cluster_name is None:
-                cluster_name = metrics_from_file.get("cluster_name")
-            cpu_request_metrics = metrics_from_file["cpu_metrics"]
-            memory_request_metrics = metrics_from_file["memory_metrics"]
-            gpu_request_metrics = metrics_from_file.get("gpu_metrics", None)
-            processor.merge_metrics("cpu_request", cpu_request_metrics)
-            processor.merge_metrics("memory_request", memory_request_metrics)
-            if gpu_request_metrics is not None:
-                processor.merge_metrics("gpu_request", gpu_request_metrics)
-
-            if report_start_date is None:
-                report_start_date = metrics_from_file["start_date"]
-            elif compare_dates(metrics_from_file["start_date"], report_start_date):
-                report_start_date = metrics_from_file["start_date"]
+    metrics_metadata = load_metadata(files)

-            if report_end_date is None:
-                report_end_date = metrics_from_file["end_date"]
-            elif compare_dates(report_end_date, metrics_from_file["end_date"]):
-                report_end_date = metrics_from_file["end_date"]
+    cluster_name = metrics_metadata.cluster_name
+    report_start_date = metrics_metadata.report_start_date
+    report_end_date = metrics_metadata.report_end_date
+    interval_minutes = metrics_metadata.interval_minutes

-    if cluster_name is None:
-        cluster_name = "Unknown Cluster"
+    processor = load_and_merge_metrics(interval_minutes, files)

    logger.info(
        f"Generating report from {report_start_date} to {report_end_date} for {cluster_name}"
@@ -171,13 +209,13 @@ def main():
logger.info("Using nerc rates for rates and outages")
rates_data = rates.load_from_url()
invoice_rates = invoice.Rates(
cpu=rates_data.get_value_at("CPU SU Rate", report_month, Decimal),
gpu_a100=rates_data.get_value_at("GPUA100 SU Rate", report_month, Decimal),
gpu_a100sxm4=rates_data.get_value_at(
cpu=rates_data.get_value_at("CPU SU Rate", report_month, Decimal), # type: ignore
gpu_a100=rates_data.get_value_at("GPUA100 SU Rate", report_month, Decimal), # type: ignore
gpu_a100sxm4=rates_data.get_value_at( # type: ignore
Collaborator:

Are you using a linter in addition to the ruff that we use in the CI? I didn't get any pre-commit errors when removing these comments.

Collaborator (Author):

I think it was vscode yelling at me so I put these, but I am going to remove these.

"GPUA100SXM4 SU Rate", report_month, Decimal
),
gpu_v100=rates_data.get_value_at("GPUV100 SU Rate", report_month, Decimal),
gpu_h100=rates_data.get_value_at("GPUH100 SU Rate", report_month, Decimal),
gpu_v100=rates_data.get_value_at("GPUV100 SU Rate", report_month, Decimal), # type: ignore
gpu_h100=rates_data.get_value_at("GPUH100 SU Rate", report_month, Decimal), # type: ignore
)
    outage_data = outages.load_from_url()
    ignore_hours = outage_data.get_outages_during(
@@ -197,20 +235,11 @@ def main():
    for start_time, end_time in ignore_hours:
        logger.info(f"{start_time} to {end_time} will be excluded from the invoice")

-    if args.invoice_file:
-        invoice_file = args.invoice_file
-    else:
-        invoice_file = f"NERC OpenShift {report_month}.csv"
-
-    if args.class_invoice_file:
-        class_invoice_file = args.class_invoice_file
-    else:
-        class_invoice_file = f"NERC OpenShift Classes {report_month}.csv"
-
-    if args.pod_report_file:
-        pod_report_file = args.pod_report_file
-    else:
-        pod_report_file = f"Pod NERC OpenShift {report_month}.csv"
+    invoice_file = args.invoice_file or f"{cluster_name} {report_month}.csv"
+    class_invoice_file = (
+        args.class_invoice_file or f"Classes-{cluster_name} {report_month}.csv"
+    )
+    pod_report_file = args.pod_report_file or f"Pod-{cluster_name} {report_month}.csv"

    report_start_date = datetime.strptime(report_start_date, "%Y-%m-%d")
    report_end_date = datetime.strptime(report_end_date, "%Y-%m-%d")
143 changes: 143 additions & 0 deletions openshift_metrics/tests/conftest.py
@@ -0,0 +1,143 @@
import pytest


@pytest.fixture
def mock_metrics_file1():
    cpu_metrics = [
        {
            "metric": {
                "pod": "pod1",
                "namespace": "namespace1",
                "resource": "cpu",
            },
            "values": [
                [0, 10],
                [60, 15],
                [120, 20],
            ],
        },
        {
            "metric": {
                "pod": "pod2",
                "namespace": "namespace1",
                "resource": "cpu",
            },
            "values": [
                [0, 30],
                [60, 35],
                [120, 40],
            ],
        },
    ]
    memory_metrics = [
        {
            "metric": {
                "pod": "pod1",
                "namespace": "namespace1",
                "resource": "memory",
            },
            "values": [
                [0, 10],
                [60, 15],
                [120, 20],
            ],
        },
        {
            "metric": {
                "pod": "pod2",
                "namespace": "namespace1",
"resource": "cpu",
            },
            "values": [
                [0, 30],
                [60, 35],
                [120, 40],
            ],
        },
    ]
    return {
        "cluster_name": "ocp-prod",
        "start_date": "2025-09-20",
        "end_date": "2025-09-20",
        "interval_minutes": "15",
        "cpu_metrics": cpu_metrics,
        "memory_metrics": memory_metrics,
    }


@pytest.fixture
def mock_metrics_file2():
    cpu_metrics = [
        {
            "metric": {
                "pod": "pod1",
                "namespace": "namespace1",
                "resource": "cpu",
            },
            "values": [
                [180, 10],
                [240, 15],
                [300, 20],
            ],
        },
        {
            "metric": {
                "pod": "pod2",
                "namespace": "namespace1",
                "resource": "cpu",
            },
            "values": [
                [180, 30],
                [240, 35],
                [300, 40],
            ],
        },
    ]
    memory_metrics = [
        {
            "metric": {
                "pod": "pod1",
                "namespace": "namespace1",
                "resource": "memory",
            },
            "values": [
                [180, 10],
                [240, 15],
                [300, 20],
            ],
        },
        {
            "metric": {
                "pod": "pod2",
                "namespace": "namespace1",
"resource": "cpu",
            },
            "values": [
                [180, 30],
                [240, 35],
                [300, 40],
            ],
        },
    ]
    return {
        "cluster_name": "ocp-prod",
        "start_date": "2025-09-21",
        "end_date": "2025-09-21",
        "cpu_metrics": cpu_metrics,
        "memory_metrics": memory_metrics,
        "interval_minutes": "15",
    }


@pytest.fixture
def mock_metrics_file3():
    cpu_metrics = []
    memory_metrics = []
    return {
        "cluster_name": "ocp-prod",
        "start_date": "2025-09-21",
        "end_date": "2025-09-21",
        "interval_minutes": "3",  # file1 and file2 have 15 minutes
        "cpu_metrics": cpu_metrics,
        "memory_metrics": memory_metrics,
    }
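As a sketch of how these fixtures might be exercised, assuming the tests can import the module as openshift_metrics.merge; the test names and the tmp_path wiring are illustrative, not part of this diff:

# Hypothetical test sketch: write the fixture dicts to disk with tmp_path,
# then check that load_metadata spans both files and rejects mixed intervals.
import json

import pytest

from openshift_metrics import merge


def test_load_metadata_spans_files(tmp_path, mock_metrics_file1, mock_metrics_file2):
    files = []
    for name, payload in [("f1.json", mock_metrics_file1), ("f2.json", mock_metrics_file2)]:
        path = tmp_path / name
        path.write_text(json.dumps(payload))
        files.append(str(path))

    metadata = merge.load_metadata(files)
    assert metadata.cluster_name == "ocp-prod"
    assert metadata.report_start_date == "2025-09-20"
    assert metadata.report_end_date == "2025-09-21"


def test_load_metadata_rejects_mixed_intervals(tmp_path, mock_metrics_file1, mock_metrics_file3):
    files = []
    for name, payload in [("f1.json", mock_metrics_file1), ("f3.json", mock_metrics_file3)]:
        path = tmp_path / name
        path.write_text(json.dumps(payload))
        files.append(str(path))

    # load_metadata calls sys.exit on an interval mismatch, which raises SystemExit.
    with pytest.raises(SystemExit):
        merge.load_metadata(files)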