Skip to content

Commit 4480a0f

Browse files
committed
Consolidated config into constants, config with env files.
1 parent 440a773 commit 4480a0f

File tree

8 files changed

+220
-99
lines changed

8 files changed

+220
-99
lines changed

openshift_metrics/.env.example

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# =============================================================================
2+
# OPENSHIFT METRICS CONFIGURATION
3+
# =============================================================================
4+
# This file contains all configuration options for the openshift metrics system.
5+
# All variable names match the corresponding Python variables in config.py exactly.
6+
#
7+
# TO USE: Copy this file to .env in the openshift_metrics directory
8+
# Command: cp .env.example .env
9+
# Then modify the values as needed for your environment.
10+
11+
# =============================================================================
12+
# INFRASTRUCTURE CONFIGURATION
13+
# =============================================================================
14+
15+
# OpenShift/Prometheus Configuration
16+
OPENSHIFT_PROMETHEUS_URL=https://thanos-querier-openshift-monitoring.apps.shift.nerc.mghpcc.org
17+
OPENSHIFT_TOKEN=your_openshift_token_here
18+
19+
# S3 Configuration
20+
S3_OUTPUT_ENDPOINT_URL=https://s3.us-east-005.backblazeb2.com
21+
S3_OUTPUT_ACCESS_KEY_ID=your_s3_access_key_here
22+
S3_OUTPUT_SECRET_ACCESS_KEY=your_s3_secret_key_here
23+
S3_INVOICE_BUCKET=nerc-invoicing
24+
S3_METRICS_BUCKET=openshift_metrics
25+
26+
# =============================================================================
27+
# PROCESSING CONFIGURATION
28+
# =============================================================================
29+
30+
# Metrics processing intervals
31+
INTERVAL_MINUTES=15
32+
STEP_MINUTES=15
33+
GPU_MAPPING_FILE=gpu_node_map.json
34+
35+
# HTTP retry configuration
36+
HTTP_RETRY_TOTAL=3
37+
HTTP_RETRY_BACKOFF_FACTOR=1
38+
39+
# =============================================================================
40+
# REPORT CONFIGURATION (formerly CLI arguments)
41+
# =============================================================================
42+
43+
# Report dates (leave empty to use defaults - yesterday)
44+
REPORT_START_DATE=
45+
REPORT_END_DATE=
46+
47+
# Upload configuration
48+
UPLOAD_TO_S3=false
49+
50+
# File configuration (leave empty to use default naming patterns)
51+
OUTPUT_FILE=
52+
INVOICE_FILE=
53+
POD_REPORT_FILE=
54+
CLASS_INVOICE_FILE=
55+
56+
# Ignore hours configuration (comma-separated timestamp ranges)
57+
# Format: YYYY-MM-DDTHH:MM:SS,YYYY-MM-DDTHH:MM:SS
58+
IGNORE_HOURS=
59+
60+
# =============================================================================
61+
# RATES AND BILLING CONFIGURATION
62+
# =============================================================================
63+
64+
# Rate source configuration
65+
USE_NERC_RATES=false
66+
67+
# Individual rates (Decimal values)
68+
RATE_CPU_SU=0.013
69+
RATE_GPU_V100_SU=1.214
70+
RATE_GPU_A100SXM4_SU=2.078
71+
RATE_GPU_A100_SU=1.803
72+
RATE_GPU_H100_SU=6.04
73+
74+
# Legacy rate (for backward compatibility)
75+
GPU_A100_RATE=1.803
76+
77+
# =============================================================================
78+
# BUSINESS LOGIC CONFIGURATION
79+
# =============================================================================
80+
81+
# Namespaces that support class-based reporting (comma-separated)
82+
NAMESPACES_WITH_CLASSES=rhods-notebooks
83+
84+
# =============================================================================
85+
# GPU CONFIGURATION
86+
# =============================================================================
87+
88+
# GPU types
89+
GPU_A100=NVIDIA-A100-40GB
90+
GPU_A100_SXM4=NVIDIA-A100-SXM4-40GB
91+
GPU_V100=Tesla-V100-PCIE-32GB
92+
GPU_H100=NVIDIA-H100-80GB-HBM3
93+
94+
# GPU Resource - MIG Geometries
95+
MIG_1G_5GB=nvidia.com/mig-1g.5gb
96+
MIG_2G_10GB=nvidia.com/mig-2g.10gb
97+
MIG_3G_20GB=nvidia.com/mig-3g.20gb
98+
WHOLE_GPU=nvidia.com/gpu
99+
100+
# VM GPU Resources
101+
VM_GPU_H100=nvidia.com/H100_SXM5_80GB
102+
VM_GPU_A100_SXM4=nvidia.com/A100_SXM4_40GB
103+
VM_GPU_V100=nvidia.com/GV100GL_Tesla_V100

openshift_metrics/invoice.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,35 @@
55
from decimal import Decimal, ROUND_HALF_UP
66
import datetime
77

8-
# GPU types
9-
GPU_A100 = "NVIDIA-A100-40GB"
10-
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
11-
GPU_V100 = "Tesla-V100-PCIE-32GB"
12-
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
13-
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
8+
from openshift_metrics import config, constants
9+
10+
# Import constants from centralized constants module
11+
GPU_A100 = constants.GPU_A100
12+
GPU_A100_SXM4 = constants.GPU_A100_SXM4
13+
GPU_V100 = constants.GPU_V100
14+
GPU_H100 = constants.GPU_H100
15+
GPU_UNKNOWN_TYPE = constants.GPU_UNKNOWN_TYPE
1416

1517
# GPU Resource - MIG Geometries
16-
# A100 Strategies
17-
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
18-
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
19-
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
20-
WHOLE_GPU = "nvidia.com/gpu"
18+
MIG_1G_5GB = constants.MIG_1G_5GB
19+
MIG_2G_10GB = constants.MIG_2G_10GB
20+
MIG_3G_20GB = constants.MIG_3G_20GB
21+
WHOLE_GPU = constants.WHOLE_GPU
2122

2223
# VM GPU Resources
23-
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
24-
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
25-
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
24+
VM_GPU_H100 = constants.VM_GPU_H100
25+
VM_GPU_A100_SXM4 = constants.VM_GPU_A100_SXM4
26+
VM_GPU_V100 = constants.VM_GPU_V100
2627

2728
# SU Types
28-
SU_CPU = "OpenShift CPU"
29-
SU_A100_GPU = "OpenShift GPUA100"
30-
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
31-
SU_V100_GPU = "OpenShift GPUV100"
32-
SU_H100_GPU = "OpenShift GPUH100"
33-
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
34-
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
35-
SU_UNKNOWN = "Openshift Unknown"
29+
SU_CPU = constants.SU_CPU
30+
SU_A100_GPU = constants.SU_A100_GPU
31+
SU_A100_SXM4_GPU = constants.SU_A100_SXM4_GPU
32+
SU_V100_GPU = constants.SU_V100_GPU
33+
SU_H100_GPU = constants.SU_H100_GPU
34+
SU_UNKNOWN_GPU = constants.SU_UNKNOWN_GPU
35+
SU_UNKNOWN_MIG_GPU = constants.SU_UNKNOWN_MIG_GPU
36+
SU_UNKNOWN = constants.SU_UNKNOWN
3637

3738
ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])
3839

openshift_metrics/merge.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from decimal import Decimal
1212
import nerc_rates
1313

14-
from openshift_metrics import utils, invoice
14+
from openshift_metrics import utils, invoice, config, constants
1515
from openshift_metrics.metrics_processor import MetricsProcessor
1616

1717
logging.basicConfig(level=logging.INFO)
@@ -56,13 +56,13 @@ def get_su_definitions(report_month) -> dict:
5656
)
5757
)
5858
# Some internal SUs that I like to map to when there's insufficient data
59-
su_definitions[invoice.SU_UNKNOWN_GPU] = {"GPUs": 1, "vCPUs": 8, "RAM": 64 * 1024}
60-
su_definitions[invoice.SU_UNKNOWN_MIG_GPU] = {
59+
su_definitions[constants.SU_UNKNOWN_GPU] = {"GPUs": 1, "vCPUs": 8, "RAM": 64 * 1024}
60+
su_definitions[constants.SU_UNKNOWN_MIG_GPU] = {
6161
"GPUs": 1,
6262
"vCPUs": 8,
6363
"RAM": 64 * 1024,
6464
}
65-
su_definitions[invoice.SU_UNKNOWN] = {"GPUs": 0, "vCPUs": 1, "RAM": 1024}
65+
su_definitions[constants.SU_UNKNOWN] = {"GPUs": 0, "vCPUs": 1, "RAM": 1024}
6666
return su_definitions
6767

6868

@@ -73,16 +73,23 @@ def main():
7373
parser.add_argument(
7474
"--invoice-file",
7575
help="Name of the invoice file. Defaults to NERC OpenShift <report_month>.csv",
76+
default=config.INVOICE_FILE,
7677
)
7778
parser.add_argument(
7879
"--pod-report-file",
7980
help="Name of the pod report file. Defaults to Pod NERC OpenShift <report_month>.csv",
81+
default=config.POD_REPORT_FILE,
8082
)
8183
parser.add_argument(
8284
"--class-invoice-file",
8385
help="Name of the class report file. Defaults to NERC OpenShift Class <report_month>.csv",
86+
default=config.CLASS_INVOICE_FILE,
87+
)
88+
parser.add_argument(
89+
"--upload-to-s3",
90+
action="store_true",
91+
default=config.UPLOAD_TO_S3,
8492
)
85-
parser.add_argument("--upload-to-s3", action="store_true")
8693
parser.add_argument(
8794
"--ignore-hours",
8895
type=parse_timestamp_range,
@@ -93,12 +100,13 @@ def main():
93100
"--use-nerc-rates",
94101
action="store_true",
95102
help="Use rates from the nerc-rates repo",
103+
default=config.USE_NERC_RATES,
96104
)
97-
parser.add_argument("--rate-cpu-su", type=Decimal)
98-
parser.add_argument("--rate-gpu-v100-su", type=Decimal)
99-
parser.add_argument("--rate-gpu-a100sxm4-su", type=Decimal)
100-
parser.add_argument("--rate-gpu-a100-su", type=Decimal)
101-
parser.add_argument("--rate-gpu-h100-su", type=Decimal)
105+
parser.add_argument("--rate-cpu-su", type=Decimal, default=config.RATE_CPU_SU)
106+
parser.add_argument("--rate-gpu-v100-su", type=Decimal, default=config.RATE_GPU_V100_SU)
107+
parser.add_argument("--rate-gpu-a100sxm4-su", type=Decimal, default=config.RATE_GPU_A100SXM4_SU)
108+
parser.add_argument("--rate-gpu-a100-su", type=Decimal, default=config.RATE_GPU_A100_SU)
109+
parser.add_argument("--rate-gpu-h100-su", type=Decimal, default=config.RATE_GPU_H100_SU)
102110

103111
args = parser.parse_args()
104112
files = args.files
@@ -208,7 +216,7 @@ def main():
208216
rates=rates,
209217
su_definitions=su_definitions,
210218
cluster_name=cluster_name,
211-
namespaces_with_classes=["rhods-notebooks"],
219+
namespaces_with_classes=config.NAMESPACES_WITH_CLASSES,
212220
ignore_hours=ignore_hours,
213221
)
214222
utils.write_metrics_by_pod(
@@ -219,7 +227,7 @@ def main():
219227
)
220228

221229
if args.upload_to_s3:
222-
bucket_name = os.environ.get("S3_INVOICE_BUCKET", "nerc-invoicing")
230+
bucket_name = config.S3_INVOICE_BUCKET
223231
primary_location = (
224232
f"Invoices/{report_month}/"
225233
f"Service Invoices/{cluster_name} {report_month}.csv"

openshift_metrics/metrics_processor.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33
from collections import namedtuple
44
import logging
55

6+
from openshift_metrics import config
7+
68
logging.basicConfig(level=logging.INFO)
79
logger = logging.getLogger(__name__)
810

9-
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
11+
# Import constants module
12+
from openshift_metrics import constants
13+
1014
GPUInfo = namedtuple("GPUInfo", ["gpu_type", "gpu_resource", "node_model"])
1115

1216

@@ -15,13 +19,13 @@ class MetricsProcessor:
1519

1620
def __init__(
1721
self,
18-
interval_minutes: int = 15,
22+
interval_minutes: int = None,
1923
merged_data: dict = None,
20-
gpu_mapping_file: str = "gpu_node_map.json",
24+
gpu_mapping_file: str = None,
2125
):
22-
self.interval_minutes = interval_minutes
26+
self.interval_minutes = interval_minutes or config.INTERVAL_MINUTES
2327
self.merged_data = merged_data if merged_data is not None else {}
24-
self.gpu_mapping = self._load_gpu_mapping(gpu_mapping_file)
28+
self.gpu_mapping = self._load_gpu_mapping(gpu_mapping_file or config.GPU_MAPPING_FILE)
2529

2630
def merge_metrics(self, metric_name, metric_list):
2731
"""Merge metrics (cpu, memory, gpu) by pod"""
@@ -75,16 +79,16 @@ def _extract_gpu_info(self, metric_name: str, metric: Dict) -> GPUInfo:
7579

7680
if metric_name == "gpu_request":
7781
gpu_type = metric["metric"].get(
78-
"label_nvidia_com_gpu_product", GPU_UNKNOWN_TYPE
82+
"label_nvidia_com_gpu_product", constants.GPU_UNKNOWN_TYPE
7983
)
8084
gpu_resource = metric["metric"].get("resource")
8185
node_model = metric["metric"].get("label_nvidia_com_gpu_machine")
8286

8387
# Sometimes GPU labels from the nodes can be missing, in that case
8488
# we get the gpu_type from the gpu-node file
85-
if gpu_type == GPU_UNKNOWN_TYPE:
89+
if gpu_type == constants.GPU_UNKNOWN_TYPE:
8690
node_name = metric["metric"].get("node")
87-
gpu_type = self.gpu_mapping.get(node_name, GPU_UNKNOWN_TYPE)
91+
gpu_type = self.gpu_mapping.get(node_name, constants.GPU_UNKNOWN_TYPE)
8892

8993
return GPUInfo(gpu_type, gpu_resource, node_model)
9094

openshift_metrics/openshift_prometheus_metrics.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,24 +20,21 @@
2020
import json
2121
import logging
2222

23-
from openshift_metrics import utils
23+
from openshift_metrics import utils, config
2424
from openshift_metrics.prometheus_client import PrometheusClient
2525
from openshift_metrics.metrics_processor import MetricsProcessor
2626

2727
logging.basicConfig(level=logging.INFO)
2828
logger = logging.getLogger(__name__)
2929

30-
CPU_REQUEST = 'kube_pod_resource_request{resource="cpu", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
31-
MEMORY_REQUEST = 'kube_pod_resource_request{resource="memory", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
32-
GPU_REQUEST = 'kube_pod_resource_request{resource=~"nvidia.com.*", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable'
33-
KUBE_NODE_LABELS = 'kube_node_labels{label_nvidia_com_gpu_product!=""}'
34-
KUBE_POD_LABELS = 'kube_pod_labels{label_nerc_mghpcc_org_class!=""}'
30+
# Use centralized configuration for Prometheus queries and cluster mappings
31+
CPU_REQUEST = config.PROMETHEUS_QUERIES["CPU_REQUEST"]
32+
MEMORY_REQUEST = config.PROMETHEUS_QUERIES["MEMORY_REQUEST"]
33+
GPU_REQUEST = config.PROMETHEUS_QUERIES["GPU_REQUEST"]
34+
KUBE_NODE_LABELS = config.PROMETHEUS_QUERIES["KUBE_NODE_LABELS"]
35+
KUBE_POD_LABELS = config.PROMETHEUS_QUERIES["KUBE_POD_LABELS"]
3536

36-
URL_CLUSTER_NAME_MAPPING = {
37-
"https://thanos-querier-openshift-monitoring.apps.shift.nerc.mghpcc.org": "ocp-prod",
38-
"https://thanos-querier-openshift-monitoring.apps.ocp-test.nerc.mghpcc.org": "ocp-test",
39-
"https://thanos-querier-openshift-monitoring.apps.edu.nerc.mghpcc.org": "academic",
40-
}
37+
URL_CLUSTER_NAME_MAPPING = config.CLUSTER_NAME_MAPPING
4138

4239

4340
def main():
@@ -47,20 +44,27 @@ def main():
4744
parser.add_argument(
4845
"--openshift-url",
4946
help="OpenShift Prometheus URL",
50-
default=os.getenv("OPENSHIFT_PROMETHEUS_URL"),
47+
default=config.OPENSHIFT_PROMETHEUS_URL,
5148
)
5249
parser.add_argument(
5350
"--report-start-date",
5451
help="report date (ex: 2022-03-14)",
55-
default=(datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
52+
default=config.REPORT_START_DATE,
5653
)
5754
parser.add_argument(
5855
"--report-end-date",
5956
help="report date (ex: 2022-03-14)",
60-
default=(datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d"),
57+
default=config.REPORT_END_DATE,
58+
)
59+
parser.add_argument(
60+
"--upload-to-s3",
61+
action="store_true",
62+
default=config.UPLOAD_TO_S3,
63+
)
64+
parser.add_argument(
65+
"--output-file",
66+
default=config.OUTPUT_FILE,
6167
)
62-
parser.add_argument("--upload-to-s3", action="store_true")
63-
parser.add_argument("--output-file")
6468

6569
args = parser.parse_args()
6670
if not args.openshift_url:
@@ -88,7 +92,7 @@ def main():
8892
f"Generating report starting {report_start_date} and ending {report_end_date} in {output_file}"
8993
)
9094

91-
token = os.environ.get("OPENSHIFT_TOKEN")
95+
token = config.OPENSHIFT_TOKEN
9296
prom_client = PrometheusClient(openshift_url, token)
9397

9498
metrics_dict = {}
@@ -151,7 +155,7 @@ def main():
151155
json.dump(metrics_dict, file)
152156

153157
if args.upload_to_s3:
154-
bucket_name = os.environ.get("S3_METRICS_BUCKET", "openshift_metrics")
158+
bucket_name = config.S3_METRICS_BUCKET
155159
utils.upload_to_s3(output_file, bucket_name, s3_location)
156160

157161

0 commit comments

Comments
 (0)