From fbff4fde2bfa5732cd6c6f30f1dc2b116013469e Mon Sep 17 00:00:00 2001 From: Naved Ansari Date: Mon, 10 Nov 2025 22:18:30 -0500 Subject: [PATCH 1/6] This refactors merge.py into smaller functions so it's easier to read what's going on --- openshift_metrics/merge.py | 175 +++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 73 deletions(-) diff --git a/openshift_metrics/merge.py b/openshift_metrics/merge.py index 0c29364..01ed5a0 100644 --- a/openshift_metrics/merge.py +++ b/openshift_metrics/merge.py @@ -6,8 +6,9 @@ import logging import argparse from datetime import datetime, UTC +from dataclasses import dataclass import json -from typing import Tuple +from typing import Tuple, List from decimal import Decimal from nerc_rates import rates, outages @@ -19,7 +20,15 @@ logger = logging.getLogger(__name__) -def compare_dates(date_str1, date_str2): +@dataclass +class MetricsMetadata: + cluster_name: str + report_start_date: str | None + report_end_date: str | None + interval_minutes: int + + +def compare_dates(date_str1, date_str2) -> bool: """Returns true is date1 is earlier than date2""" date1 = datetime.strptime(date_str1, "%Y-%m-%d") date2 = datetime.strptime(date_str2, "%Y-%m-%d") @@ -67,6 +76,81 @@ def get_su_definitions(report_month) -> dict: return su_definitions +def load_and_merge_metrics(interval_minutes, files: List[str]) -> MetricsProcessor: + """Load and merge metrics + + Loads metrics from provided json files and then returns a processor + that has all the merged data. + """ + processor = MetricsProcessor(interval_minutes) + for file in files: + with open(file, "r") as jsonfile: + metrics_from_file = json.load(jsonfile) + cpu_request_metrics = metrics_from_file["cpu_metrics"] + memory_request_metrics = metrics_from_file["memory_metrics"] + gpu_request_metrics = metrics_from_file.get("gpu_metrics", None) + processor.merge_metrics("cpu_request", cpu_request_metrics) + processor.merge_metrics("memory_request", memory_request_metrics) + if gpu_request_metrics is not None: + processor.merge_metrics("gpu_request", gpu_request_metrics) + return processor + + +def load_metadata(files: List[str]) -> MetricsMetadata: + """ + Load only the metadata from the metrics files. + """ + cluster_name = None + report_start_date = None + report_end_date = None + interval_minutes = None + + for file in files: + with open(file, "r") as jsonfile: + metrics_from_file = json.load(jsonfile) + if cluster_name is None: + cluster_name = metrics_from_file.get("cluster_name") + + if interval_minutes is None: + interval_minutes = metrics_from_file.get("interval_minutes") + else: + interval_minutes_from_file = metrics_from_file["interval_minutes"] + if interval_minutes != interval_minutes_from_file: + sys.exit( + f"Cannot process files with different intervals {interval_minutes} != {interval_minutes_from_file}" + ) + + if report_start_date is None: + report_start_date = metrics_from_file["start_date"] + elif compare_dates(metrics_from_file["start_date"], report_start_date): + report_start_date = metrics_from_file["start_date"] + + if report_end_date is None: + report_end_date = metrics_from_file["end_date"] + elif compare_dates(report_end_date, metrics_from_file["end_date"]): + report_end_date = metrics_from_file["end_date"] + + if cluster_name is None: + cluster_name = "Unknown Cluster" + + if interval_minutes is None: + logger.info( + f"No prometheus query interval minutes found in the given set of files. Using the provided interval: {PROM_QUERY_INTERVAL_MINUTES} minute(s)" + ) + interval_minutes = PROM_QUERY_INTERVAL_MINUTES + else: + logger.info( + f"Prometheus Query interval set to {interval_minutes} minute(s) from file" + ) + + return MetricsMetadata( + cluster_name=cluster_name, + report_start_date=report_start_date, + report_end_date=report_end_date, + interval_minutes=interval_minutes, + ) + + def main(): """Reads the metrics from files and generates the reports""" parser = argparse.ArgumentParser() @@ -104,60 +188,14 @@ def main(): args = parser.parse_args() files = args.files - report_start_date = None - report_end_date = None - cluster_name = None - interval_minutes = None - - for file in files: - with open(file, "r") as jsonfile: - metrics_from_file = json.load(jsonfile) - if interval_minutes is None: - interval_minutes = metrics_from_file.get("interval_minutes") - else: - interval_minutes_from_file = metrics_from_file["interval_minutes"] - if interval_minutes != interval_minutes_from_file: - sys.exit( - f"Cannot process files with different intervals {interval_minutes} != {interval_minutes_from_file}" - ) - - if interval_minutes is None: - logger.info( - f"No prometheus query interval minutes found in the given set of files. Using the provided interval: {PROM_QUERY_INTERVAL_MINUTES} minute(s)" - ) - interval_minutes = PROM_QUERY_INTERVAL_MINUTES - else: - logger.info( - f"Prometheus Query interval set to {interval_minutes} minute(s) from file" - ) - - processor = MetricsProcessor(interval_minutes) - - for file in files: - with open(file, "r") as jsonfile: - metrics_from_file = json.load(jsonfile) - if cluster_name is None: - cluster_name = metrics_from_file.get("cluster_name") - cpu_request_metrics = metrics_from_file["cpu_metrics"] - memory_request_metrics = metrics_from_file["memory_metrics"] - gpu_request_metrics = metrics_from_file.get("gpu_metrics", None) - processor.merge_metrics("cpu_request", cpu_request_metrics) - processor.merge_metrics("memory_request", memory_request_metrics) - if gpu_request_metrics is not None: - processor.merge_metrics("gpu_request", gpu_request_metrics) - - if report_start_date is None: - report_start_date = metrics_from_file["start_date"] - elif compare_dates(metrics_from_file["start_date"], report_start_date): - report_start_date = metrics_from_file["start_date"] + metrics_metadata = load_metadata(files) - if report_end_date is None: - report_end_date = metrics_from_file["end_date"] - elif compare_dates(report_end_date, metrics_from_file["end_date"]): - report_end_date = metrics_from_file["end_date"] + cluster_name = metrics_metadata.cluster_name + report_start_date = metrics_metadata.report_start_date + report_end_date = metrics_metadata.report_end_date + interval_minutes = metrics_metadata.interval_minutes - if cluster_name is None: - cluster_name = "Unknown Cluster" + processor = load_and_merge_metrics(interval_minutes, files) logger.info( f"Generating report from {report_start_date} to {report_end_date} for {cluster_name}" @@ -171,13 +209,13 @@ def main(): logger.info("Using nerc rates for rates and outages") rates_data = rates.load_from_url() invoice_rates = invoice.Rates( - cpu=rates_data.get_value_at("CPU SU Rate", report_month, Decimal), - gpu_a100=rates_data.get_value_at("GPUA100 SU Rate", report_month, Decimal), - gpu_a100sxm4=rates_data.get_value_at( + cpu=rates_data.get_value_at("CPU SU Rate", report_month, Decimal), # type: ignore + gpu_a100=rates_data.get_value_at("GPUA100 SU Rate", report_month, Decimal), # type: ignore + gpu_a100sxm4=rates_data.get_value_at( # type: ignore "GPUA100SXM4 SU Rate", report_month, Decimal ), - gpu_v100=rates_data.get_value_at("GPUV100 SU Rate", report_month, Decimal), - gpu_h100=rates_data.get_value_at("GPUH100 SU Rate", report_month, Decimal), + gpu_v100=rates_data.get_value_at("GPUV100 SU Rate", report_month, Decimal), # type: ignore + gpu_h100=rates_data.get_value_at("GPUH100 SU Rate", report_month, Decimal), # type: ignore ) outage_data = outages.load_from_url() ignore_hours = outage_data.get_outages_during( @@ -197,20 +235,11 @@ def main(): for start_time, end_time in ignore_hours: logger.info(f"{start_time} to {end_time} will be excluded from the invoice") - if args.invoice_file: - invoice_file = args.invoice_file - else: - invoice_file = f"NERC OpenShift {report_month}.csv" - - if args.class_invoice_file: - class_invoice_file = args.class_invoice_file - else: - class_invoice_file = f"NERC OpenShift Classes {report_month}.csv" - - if args.pod_report_file: - pod_report_file = args.pod_report_file - else: - pod_report_file = f"Pod NERC OpenShift {report_month}.csv" + invoice_file = args.invoice_file or f"NERC OpenShift {report_month}.csv" + class_invoice_file = ( + args.class_invoice_file or f"NERC OpenShift Classes {report_month}.csv" + ) + pod_report_file = args.pod_report_file or f"Pod NERC OpenShift {report_month}.csv" report_start_date = datetime.strptime(report_start_date, "%Y-%m-%d") report_end_date = datetime.strptime(report_end_date, "%Y-%m-%d") From 73997cf417f88e94b8bdaa414ff34c048598cc5f Mon Sep 17 00:00:00 2001 From: Naved Ansari Date: Mon, 10 Nov 2025 22:30:37 -0500 Subject: [PATCH 2/6] Change the default names to be the ones we use for s3 --- openshift_metrics/merge.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/openshift_metrics/merge.py b/openshift_metrics/merge.py index 01ed5a0..c2f600d 100644 --- a/openshift_metrics/merge.py +++ b/openshift_metrics/merge.py @@ -157,15 +157,15 @@ def main(): parser.add_argument("files", nargs="+") parser.add_argument( "--invoice-file", - help="Name of the invoice file. Defaults to NERC OpenShift .csv", + help="Name of the invoice file. Defaults to .csv", ) parser.add_argument( "--pod-report-file", - help="Name of the pod report file. Defaults to Pod NERC OpenShift .csv", + help="Name of the pod report file. Defaults to Pod- .csv", ) parser.add_argument( "--class-invoice-file", - help="Name of the class report file. Defaults to NERC OpenShift Class .csv", + help=f"Name of the class report file. Defaults to Classes- .csv", ) parser.add_argument("--upload-to-s3", action="store_true") parser.add_argument( @@ -235,11 +235,11 @@ def main(): for start_time, end_time in ignore_hours: logger.info(f"{start_time} to {end_time} will be excluded from the invoice") - invoice_file = args.invoice_file or f"NERC OpenShift {report_month}.csv" + invoice_file = args.invoice_file or f"{cluster_name} {report_month}.csv" class_invoice_file = ( - args.class_invoice_file or f"NERC OpenShift Classes {report_month}.csv" + args.class_invoice_file or f"Classes-{cluster_name} {report_month}.csv" ) - pod_report_file = args.pod_report_file or f"Pod NERC OpenShift {report_month}.csv" + pod_report_file = args.pod_report_file or f"Pod-{cluster_name} {report_month}.csv" report_start_date = datetime.strptime(report_start_date, "%Y-%m-%d") report_end_date = datetime.strptime(report_end_date, "%Y-%m-%d") From 35028fa617a8cfba0bff01c0bf27085922fbfd10 Mon Sep 17 00:00:00 2001 From: Naved Ansari Date: Wed, 12 Nov 2025 14:47:15 -0500 Subject: [PATCH 3/6] Add tests for merge.py Add tests for some of the important functions in merge.py. Since I wrote the tests for pytest, I switched to using pytest as the runner for the rest of the tests. --- .github/workflows/unit-tests.yaml | 2 +- openshift_metrics/tests/conftest.py | 127 ++++++++++++++++++++++++++ openshift_metrics/tests/test_merge.py | 98 ++++++++++++++++++++ requirements.txt | 2 + 4 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 openshift_metrics/tests/conftest.py create mode 100644 openshift_metrics/tests/test_merge.py diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 879ca5e..771d297 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -22,4 +22,4 @@ jobs: - name: Run unit tests run: | - python -m unittest openshift_metrics/tests/test_* + pytest openshift_metrics/tests/ diff --git a/openshift_metrics/tests/conftest.py b/openshift_metrics/tests/conftest.py new file mode 100644 index 0000000..13eb1b0 --- /dev/null +++ b/openshift_metrics/tests/conftest.py @@ -0,0 +1,127 @@ +import pytest + + +@pytest.fixture +def mock_metrics_file1(): + cpu_metrics = [ + { + "metric": { + "pod": "pod1", + "namespace": "namespace1", + "resource": "cpu", + }, + "values": [ + [0, 10], + [60, 15], + [120, 20], + ], + }, + { + "metric": { + "pod": "pod2", + "namespace": "namespace1", + "resource": "cpu", + }, + "values": [ + [0, 30], + [60, 35], + [120, 40], + ], + }, + ] + memory_metrics = [ + { + "metric": { + "pod": "pod1", + "namespace": "namespace1", + "resource": "memory", + }, + "values": [ + [0, 10], + [60, 15], + [120, 20], + ], + }, + { + "metric": { + "pod": "pod2", + "namespace": "namespace1", + "resource": "cpu", + }, + "values": [ + [0, 30], + [60, 35], + [120, 40], + ], + }, + ] + return { + "cluster_name": "ocp-prod", + "start_date": "2025-09-20", + "end_date": "2025-09-20", + "cpu_metrics": cpu_metrics, + "memory_metrics": memory_metrics, + } + + +@pytest.fixture +def mock_metrics_file2(): + cpu_metrics = [ + { + "metric": { + "pod": "pod1", + "namespace": "namespace1", + "resource": "cpu", + }, + "values": [ + [180, 10], + [240, 15], + [300, 20], + ], + }, + { + "metric": { + "pod": "pod2", + "namespace": "namespace1", + "resource": "cpu", + }, + "values": [ + [180, 30], + [240, 35], + [300, 40], + ], + }, + ] + memory_metrics = [ + { + "metric": { + "pod": "pod1", + "namespace": "namespace1", + "resource": "memory", + }, + "values": [ + [180, 10], + [240, 15], + [300, 20], + ], + }, + { + "metric": { + "pod": "pod2", + "namespace": "namespace1", + "resource": "cpu", + }, + "values": [ + [180, 30], + [240, 35], + [300, 40], + ], + }, + ] + return { + "cluster_name": "ocp-prod", + "start_date": "2025-09-21", + "end_date": "2025-09-21", + "cpu_metrics": cpu_metrics, + "memory_metrics": memory_metrics, + } diff --git a/openshift_metrics/tests/test_merge.py b/openshift_metrics/tests/test_merge.py new file mode 100644 index 0000000..d6c7b81 --- /dev/null +++ b/openshift_metrics/tests/test_merge.py @@ -0,0 +1,98 @@ +import pytest +import json +from decimal import Decimal + +from openshift_metrics.merge import ( + compare_dates, + get_su_definitions, + load_and_merge_metrics, + load_metadata, +) + + +@pytest.mark.parametrize( + "date1, date2, expected_result", + [ + ("2025-01-18", "2025-01-20", True), + ("2025-01-18", "2025-01-16", False), + ("2025-01-18", "2025-01-18", False), + ], +) +def test_compare_dates(date1, date2, expected_result): + assert compare_dates(date1, date2) is expected_result + + +def test_get_su_definitions(mocker): + mock_rates = { + "vCPUs in GPUV100 SU": Decimal("20"), + "RAM in GPUV100 SU": Decimal("8192"), + "GPUs in GPUV100 SU": Decimal("1"), + "vCPUs in CPU SU": Decimal("5"), + "RAM in CPU SU": Decimal("1024"), + "GPUs in CPU SU": Decimal("0"), + } + mock_rates_data = mocker.MagicMock() + + def mock_get_value_at(key, month, value_type): + return mock_rates.get(key, Decimal("67")) + + mock_rates_data.get_value_at.side_effect = mock_get_value_at + mocker.patch( + "openshift_metrics.merge.rates.load_from_url", return_value=mock_rates_data + ) + report_month = "2025-10" + su_definitions = get_su_definitions(report_month) + + assert "OpenShift GPUV100" in su_definitions + assert su_definitions["OpenShift GPUV100"]["vCPUs"] == Decimal("20") + assert su_definitions["OpenShift GPUV100"]["RAM"] == Decimal("8192") + assert su_definitions["OpenShift GPUV100"]["GPUs"] == Decimal("1") + + assert "OpenShift CPU" in su_definitions + assert su_definitions["OpenShift CPU"]["vCPUs"] == Decimal("5") + assert su_definitions["OpenShift CPU"]["RAM"] == Decimal("1024") + assert su_definitions["OpenShift CPU"]["GPUs"] == Decimal("0") + + # This should get the default test value + assert su_definitions["OpenShift GPUH100"]["GPUs"] == Decimal("67") + + +def test_load_and_merge_data(tmp_path, mock_metrics_file1, mock_metrics_file2): + """ + Test that we can load metrics from the 2 files and merge the metrics from those. + + Note that we already have tests that test the merging of the data, this mostly + focuses on the loading part. + """ + p1 = tmp_path / "file1.json" + p2 = tmp_path / "file2.json" + + p1.write_text(json.dumps(mock_metrics_file1)) + p2.write_text(json.dumps(mock_metrics_file2)) + + processor = load_and_merge_metrics([p1, p2]) + + pod1_metrics = processor.merged_data["namespace1"]["pod1"]["metrics"] + + # check values from file1.json are in the merged_data + assert 60 in pod1_metrics # 60 is the epoch time stamp + assert pod1_metrics[60]["cpu_request"] == 15 + assert pod1_metrics[60]["memory_request"] == 15 + + # check values from file2.json are in the merged_data + assert 180 in pod1_metrics + assert pod1_metrics[180]["cpu_request"] == 10 + assert pod1_metrics[180]["memory_request"] == 10 + + +def test_load_metadata(tmp_path, mock_metrics_file1, mock_metrics_file2): + """Test we can load metadata from the metrics files.""" + + p1 = tmp_path / "file1.json" + p2 = tmp_path / "file2.json" + p1.write_text(json.dumps(mock_metrics_file1)) + p2.write_text(json.dumps(mock_metrics_file2)) + metadata = load_metadata([p1, p2]) + assert metadata.cluster_name == "ocp-prod" + assert metadata.report_start_date == "2025-09-20" + assert metadata.report_end_date == "2025-09-21" diff --git a/requirements.txt b/requirements.txt index 4c0d061..c5519a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ requests>=2.18.4 boto3<1.36 https://github.com/CCI-MOC/nerc-rates/archive/main.zip +pytest +pytest-mock From 29ef88113e4cfb8bb91a4f6d940f10f4a1c8286c Mon Sep 17 00:00:00 2001 From: Naved Ansari Date: Wed, 12 Nov 2025 15:05:02 -0500 Subject: [PATCH 4/6] Fix formatting issue --- openshift_metrics/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openshift_metrics/merge.py b/openshift_metrics/merge.py index c2f600d..c7f9b49 100644 --- a/openshift_metrics/merge.py +++ b/openshift_metrics/merge.py @@ -165,7 +165,7 @@ def main(): ) parser.add_argument( "--class-invoice-file", - help=f"Name of the class report file. Defaults to Classes- .csv", + help="Name of the class report file. Defaults to Classes- .csv", ) parser.add_argument("--upload-to-s3", action="store_true") parser.add_argument( From 6d219b42718b2ba0d639cf010baa45aefe31d768 Mon Sep 17 00:00:00 2001 From: Naved Ansari Date: Mon, 17 Nov 2025 17:04:55 -0500 Subject: [PATCH 5/6] Update tests to work with setting interval_minutes --- openshift_metrics/tests/conftest.py | 16 ++++++++++++++++ openshift_metrics/tests/test_merge.py | 15 ++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/openshift_metrics/tests/conftest.py b/openshift_metrics/tests/conftest.py index 13eb1b0..a717c5f 100644 --- a/openshift_metrics/tests/conftest.py +++ b/openshift_metrics/tests/conftest.py @@ -59,6 +59,7 @@ def mock_metrics_file1(): "cluster_name": "ocp-prod", "start_date": "2025-09-20", "end_date": "2025-09-20", + "interval_minutes": "15", "cpu_metrics": cpu_metrics, "memory_metrics": memory_metrics, } @@ -124,4 +125,19 @@ def mock_metrics_file2(): "end_date": "2025-09-21", "cpu_metrics": cpu_metrics, "memory_metrics": memory_metrics, + "interval_minutes": "15", + } + + +@pytest.fixture +def mock_metrics_file3(): + cpu_metrics = [] + memory_metrics = [] + return { + "cluster_name": "ocp-prod", + "start_date": "2025-09-21", + "end_date": "2025-09-21", + "interval_minutes": "3", # file1 and file2 have 15 minutes + "cpu_metrics": cpu_metrics, + "memory_metrics": memory_metrics, } diff --git a/openshift_metrics/tests/test_merge.py b/openshift_metrics/tests/test_merge.py index d6c7b81..91f20aa 100644 --- a/openshift_metrics/tests/test_merge.py +++ b/openshift_metrics/tests/test_merge.py @@ -70,7 +70,7 @@ def test_load_and_merge_data(tmp_path, mock_metrics_file1, mock_metrics_file2): p1.write_text(json.dumps(mock_metrics_file1)) p2.write_text(json.dumps(mock_metrics_file2)) - processor = load_and_merge_metrics([p1, p2]) + processor = load_and_merge_metrics(2, [p1, p2]) pod1_metrics = processor.merged_data["namespace1"]["pod1"]["metrics"] @@ -96,3 +96,16 @@ def test_load_metadata(tmp_path, mock_metrics_file1, mock_metrics_file2): assert metadata.cluster_name == "ocp-prod" assert metadata.report_start_date == "2025-09-20" assert metadata.report_end_date == "2025-09-21" + assert metadata.interval_minutes == "15" + + +def test_load_metadata_failure(tmp_path, mock_metrics_file2, mock_metrics_file3): + """Test that loading metadata fails when files have different interval_minutes.""" + + p2 = tmp_path / "file2.json" + p3 = tmp_path / "file3.json" + p2.write_text(json.dumps(mock_metrics_file2)) + p3.write_text(json.dumps(mock_metrics_file3)) + + with pytest.raises(SystemExit): + load_metadata([p2, p3]) From 717ddb22ac50a64d984ecb9523f3643edf95f974 Mon Sep 17 00:00:00 2001 From: Naved Ansari Date: Tue, 18 Nov 2025 11:51:09 -0500 Subject: [PATCH 6/6] Remove vscode specific linting directives --- openshift_metrics/merge.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openshift_metrics/merge.py b/openshift_metrics/merge.py index c7f9b49..3bfa4ef 100644 --- a/openshift_metrics/merge.py +++ b/openshift_metrics/merge.py @@ -209,13 +209,13 @@ def main(): logger.info("Using nerc rates for rates and outages") rates_data = rates.load_from_url() invoice_rates = invoice.Rates( - cpu=rates_data.get_value_at("CPU SU Rate", report_month, Decimal), # type: ignore - gpu_a100=rates_data.get_value_at("GPUA100 SU Rate", report_month, Decimal), # type: ignore - gpu_a100sxm4=rates_data.get_value_at( # type: ignore + cpu=rates_data.get_value_at("CPU SU Rate", report_month, Decimal), + gpu_a100=rates_data.get_value_at("GPUA100 SU Rate", report_month, Decimal), + gpu_a100sxm4=rates_data.get_value_at( "GPUA100SXM4 SU Rate", report_month, Decimal ), - gpu_v100=rates_data.get_value_at("GPUV100 SU Rate", report_month, Decimal), # type: ignore - gpu_h100=rates_data.get_value_at("GPUH100 SU Rate", report_month, Decimal), # type: ignore + gpu_v100=rates_data.get_value_at("GPUV100 SU Rate", report_month, Decimal), + gpu_h100=rates_data.get_value_at("GPUH100 SU Rate", report_month, Decimal), ) outage_data = outages.load_from_url() ignore_hours = outage_data.get_outages_during(