Skip to content

Commit 9f1b0be

Browse files
committed
Consolidated config into constants, config with env files.
1 parent 440a773 commit 9f1b0be

File tree

10 files changed

+414
-103
lines changed

10 files changed

+414
-103
lines changed

openshift_metrics/.env.example

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# =============================================================================
2+
# OPENSHIFT METRICS CONFIGURATION
3+
# =============================================================================
4+
# This file contains all configuration options for the openshift metrics system.
5+
# Variable names match the Python variables in config.py, with a few exceptions
# (e.g. S3_OUTPUT_ENDPOINT_URL is read into S3_ENDPOINT_URL).
6+
#
7+
# TO USE: Copy this file to .env in the openshift_metrics directory
8+
# Command: cp .env.example .env
9+
# Then modify the values as needed for your environment.
10+
11+
# =============================================================================
12+
# INFRASTRUCTURE CONFIGURATION
13+
# =============================================================================
14+
15+
# OpenShift/Prometheus Configuration
16+
OPENSHIFT_PROMETHEUS_URL=https://thanos-querier-openshift-monitoring.apps.shift.nerc.mghpcc.org
17+
OPENSHIFT_TOKEN=your_openshift_token_here
18+
19+
# S3 Configuration
20+
S3_OUTPUT_ENDPOINT_URL=https://s3.us-east-005.backblazeb2.com
21+
S3_OUTPUT_ACCESS_KEY_ID=your_s3_access_key_here
22+
S3_OUTPUT_SECRET_ACCESS_KEY=your_s3_secret_key_here
23+
S3_INVOICE_BUCKET=nerc-invoicing
24+
S3_METRICS_BUCKET=openshift_metrics
25+
26+
# =============================================================================
27+
# PROCESSING CONFIGURATION
28+
# =============================================================================
29+
30+
# Metrics processing intervals
31+
INTERVAL_MINUTES=15
32+
STEP_MINUTES=15
33+
GPU_MAPPING_FILE=gpu_node_map.json
34+
35+
# HTTP retry configuration
36+
HTTP_RETRY_TOTAL=3
37+
HTTP_RETRY_BACKOFF_FACTOR=1
38+
39+
# =============================================================================
40+
# REPORT CONFIGURATION (formerly CLI arguments)
41+
# =============================================================================
42+
43+
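# Report dates, format YYYY-MM-DD. Remove or comment out these lines to get the
# default (yesterday): config.py only applies the default when the variable is
# unset, so an empty value is passed through as an empty string.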
44+
REPORT_START_DATE=
45+
REPORT_END_DATE=
46+
47+
# Upload configuration
48+
UPLOAD_TO_S3=false
49+
50+
# File configuration (leave empty to use default naming patterns)
51+
OUTPUT_FILE=
52+
INVOICE_FILE=
53+
POD_REPORT_FILE=
54+
CLASS_INVOICE_FILE=
55+
56+
# Ignore hours configuration (comma-separated timestamp ranges)
57+
# Format: YYYY-MM-DDTHH:MM:SS,YYYY-MM-DDTHH:MM:SS
58+
IGNORE_HOURS=
59+
60+
# =============================================================================
61+
# RATES AND BILLING CONFIGURATION
62+
# =============================================================================
63+
64+
# Rate source configuration
65+
USE_NERC_RATES=false
66+
67+
# Individual rates (Decimal values)
68+
RATE_CPU_SU=0.013
69+
RATE_GPU_V100_SU=1.214
70+
RATE_GPU_A100SXM4_SU=2.078
71+
RATE_GPU_A100_SU=1.803
72+
RATE_GPU_H100_SU=6.04
73+
74+
# Legacy rate (for backward compatibility)
75+
GPU_A100_RATE=1.803
76+
77+
# =============================================================================
78+
# BUSINESS LOGIC CONFIGURATION
79+
# =============================================================================
80+
81+
# Namespaces that support class-based reporting (comma-separated)
82+
NAMESPACES_WITH_CLASSES=rhods-notebooks
83+
84+
# =============================================================================
85+
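# GPU CONFIGURATION
# NOTE(review): config.py does not appear to read the variables in this section;
# the same values are defined as fixed constants in constants.py. Confirm
# whether these entries are reference-only.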
86+
# =============================================================================
87+
88+
# GPU types
89+
GPU_A100=NVIDIA-A100-40GB
90+
GPU_A100_SXM4=NVIDIA-A100-SXM4-40GB
91+
GPU_V100=Tesla-V100-PCIE-32GB
92+
GPU_H100=NVIDIA-H100-80GB-HBM3
93+
94+
# GPU Resource - MIG Geometries
95+
MIG_1G_5GB=nvidia.com/mig-1g.5gb
96+
MIG_2G_10GB=nvidia.com/mig-2g.10gb
97+
MIG_3G_20GB=nvidia.com/mig-3g.20gb
98+
WHOLE_GPU=nvidia.com/gpu
99+
100+
# VM GPU Resources
101+
VM_GPU_H100=nvidia.com/H100_SXM5_80GB
102+
VM_GPU_A100_SXM4=nvidia.com/A100_SXM4_40GB
103+
VM_GPU_V100=nvidia.com/GV100GL_Tesla_V100

openshift_metrics/config.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""
2+
Config for the openshift metrics.
3+
Configurable values are read from environment variables (see .env.example);
hardcoded constants are defined directly in this module.
4+
Environment variable names match the Python variable names below, with a few
exceptions (e.g. S3_OUTPUT_ENDPOINT_URL is read into S3_ENDPOINT_URL).
5+
"""
6+
7+
import os
8+
from datetime import datetime, timedelta
9+
10+
# =============================================================================
11+
# HARDCODED CONSTANTS (rarely change, application-specific)
12+
# =============================================================================
13+
14+
# Prometheus query strings
15+
PROMETHEUS_QUERIES = {
16+
"CPU_REQUEST": 'kube_pod_resource_request{resource="cpu", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable',
17+
"MEMORY_REQUEST": 'kube_pod_resource_request{resource="memory", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable',
18+
"GPU_REQUEST": 'kube_pod_resource_request{resource=~"nvidia.com.*", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable',
19+
"KUBE_NODE_LABELS": 'kube_node_labels{label_nvidia_com_gpu_product!=""}',
20+
"KUBE_POD_LABELS": 'kube_pod_labels{label_nerc_mghpcc_org_class!=""}',
21+
}
22+
23+
# Cluster name mappings
24+
CLUSTER_NAME_MAPPING = {
25+
"https://thanos-querier-openshift-monitoring.apps.shift.nerc.mghpcc.org": "ocp-prod",
26+
"https://thanos-querier-openshift-monitoring.apps.ocp-test.nerc.mghpcc.org": "ocp-test",
27+
"https://thanos-querier-openshift-monitoring.apps.edu.nerc.mghpcc.org": "academic",
28+
}
29+
30+
# Default values for empty fields
31+
DEFAULT_VALUES = {
32+
"UNKNOWN_NODE": "Unknown Node",
33+
"UNKNOWN_MODEL": "Unknown Model",
34+
"EMPTY_STRING": "",
35+
}
36+
37+
# =============================================================================
38+
# BUSINESS LOGIC CONSTANTS
39+
# =============================================================================
40+
# Note: Business logic constants (GPU types, SU types, etc.) are now in constants.py
41+
# This file only contains truly configurable values that change between deployments
42+
43+
# =============================================================================
44+
# INFRASTRUCTURE CONFIGURATION
45+
# =============================================================================
46+
47+
# OpenShift/Prometheus
48+
OPENSHIFT_PROMETHEUS_URL = os.getenv("OPENSHIFT_PROMETHEUS_URL")
49+
OPENSHIFT_TOKEN = os.getenv("OPENSHIFT_TOKEN")
50+
51+
# S3 Configuration
52+
S3_ENDPOINT_URL = os.getenv(
53+
"S3_OUTPUT_ENDPOINT_URL", "https://s3.us-east-005.backblazeb2.com"
54+
)
55+
S3_ACCESS_KEY_ID = os.getenv("S3_OUTPUT_ACCESS_KEY_ID")
56+
S3_SECRET_ACCESS_KEY = os.getenv("S3_OUTPUT_SECRET_ACCESS_KEY")
57+
S3_INVOICE_BUCKET = os.getenv("S3_INVOICE_BUCKET", "nerc-invoicing")
58+
S3_METRICS_BUCKET = os.getenv("S3_METRICS_BUCKET", "openshift_metrics")
59+
60+
# =============================================================================
61+
# PROCESSING CONFIGURATION
62+
# =============================================================================
63+
64+
# Metrics processing
65+
INTERVAL_MINUTES = int(os.getenv("INTERVAL_MINUTES", "15"))
66+
STEP_MINUTES = int(os.getenv("STEP_MINUTES", "15"))
67+
GPU_MAPPING_FILE = os.getenv("GPU_MAPPING_FILE", "gpu_node_map.json")
68+
69+
# HTTP retry configuration
70+
HTTP_RETRY_CONFIG = {
71+
"total": int(os.getenv("HTTP_RETRY_TOTAL", "3")),
72+
"backoff_factor": int(os.getenv("HTTP_RETRY_BACKOFF_FACTOR", "1")),
73+
"status_forcelist": [429, 500, 502, 503, 504],
74+
}
75+
76+
# =============================================================================
77+
# REPORT CONFIGURATION (formerly CLI arguments)
78+
# =============================================================================
79+
80+
# Report dates (with defaults)
81+
REPORT_START_DATE = os.getenv(
82+
"REPORT_START_DATE", (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
83+
)
84+
REPORT_END_DATE = os.getenv(
85+
"REPORT_END_DATE", (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
86+
)
87+
88+
# Upload configuration
89+
UPLOAD_TO_S3 = os.getenv("UPLOAD_TO_S3", "false").lower() == "true"
90+
91+
# File configuration
92+
OUTPUT_FILE = os.getenv("OUTPUT_FILE")
93+
INVOICE_FILE = os.getenv("INVOICE_FILE")
94+
POD_REPORT_FILE = os.getenv("POD_REPORT_FILE")
95+
CLASS_INVOICE_FILE = os.getenv("CLASS_INVOICE_FILE")
96+
97+
# Ignore hours configuration (comma-separated timestamp ranges)
98+
IGNORE_HOURS = os.getenv("IGNORE_HOURS", "")
99+
100+
# =============================================================================
101+
# RATES AND BILLING CONFIGURATION
102+
# =============================================================================
103+
104+
# Rate source configuration
105+
USE_NERC_RATES = os.getenv("USE_NERC_RATES", "false").lower() == "true"
106+
107+
# Individual rates (read as raw strings, None when unset; not converted to
# Decimal here)
108+
RATE_CPU_SU = os.getenv("RATE_CPU_SU")
109+
RATE_GPU_V100_SU = os.getenv("RATE_GPU_V100_SU")
110+
RATE_GPU_A100SXM4_SU = os.getenv("RATE_GPU_A100SXM4_SU")
111+
RATE_GPU_A100_SU = os.getenv("RATE_GPU_A100_SU")
112+
RATE_GPU_H100_SU = os.getenv("RATE_GPU_H100_SU")
113+
114+
# Legacy rates dictionary (for backward compatibility)
115+
# Note: This would need to import constants if used, but it's marked as legacy
116+
RATES = {
117+
# "NVIDIA-A100-40GB": Decimal(os.getenv("GPU_A100_RATE")) if os.getenv("GPU_A100_RATE") else None,
118+
}
119+
120+
# =============================================================================
121+
# BUSINESS LOGIC CONFIGURATION
122+
# =============================================================================
123+
124+
# Namespaces that support class-based reporting
125+
NAMESPACES_WITH_CLASSES = os.getenv("NAMESPACES_WITH_CLASSES", "rhods-notebooks").split(
126+
","
127+
)
128+
129+
# Default filename patterns
130+
DEFAULT_FILENAME_PATTERNS = {
131+
"INVOICE_FILE": "NERC OpenShift {report_month}.csv",
132+
"POD_REPORT_FILE": "Pod NERC OpenShift {report_month}.csv",
133+
"CLASS_INVOICE_FILE": "NERC OpenShift Classes {report_month}.csv",
134+
"OUTPUT_FILE_SINGLE": "metrics-{report_date}.json",
135+
"OUTPUT_FILE_RANGE": "metrics-{start_date}-to-{end_date}.json",
136+
}

openshift_metrics/constants.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""
2+
Business logic constants for the openshift metrics system.
3+
4+
These are fixed constants that define the business logic and don't change between deployments.
5+
For configurable values, see config.py
6+
"""
7+
8+
# =============================================================================
9+
# GPU TYPES
10+
# =============================================================================
11+
12+
GPU_A100 = "NVIDIA-A100-40GB"
13+
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
14+
GPU_V100 = "Tesla-V100-PCIE-32GB"
15+
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
16+
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
17+
18+
# =============================================================================
19+
# GPU RESOURCE - MIG GEOMETRIES
20+
# =============================================================================
21+
22+
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
23+
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
24+
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
25+
WHOLE_GPU = "nvidia.com/gpu"
26+
27+
# =============================================================================
28+
# VM GPU RESOURCES
29+
# =============================================================================
30+
31+
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
32+
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
33+
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
34+
35+
# =============================================================================
36+
# SERVICE UNIT TYPES
37+
# =============================================================================
38+
39+
SU_CPU = "OpenShift CPU"
40+
SU_A100_GPU = "OpenShift GPUA100"
41+
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
42+
SU_V100_GPU = "OpenShift GPUV100"
43+
SU_H100_GPU = "OpenShift GPUH100"
44+
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
45+
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
46+
SU_UNKNOWN = "Openshift Unknown"

openshift_metrics/invoice.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,35 @@
55
from decimal import Decimal, ROUND_HALF_UP
66
import datetime
77

8-
# GPU types
9-
GPU_A100 = "NVIDIA-A100-40GB"
10-
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
11-
GPU_V100 = "Tesla-V100-PCIE-32GB"
12-
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
13-
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
8+
from openshift_metrics import constants
9+
10+
# Module-level aliases for values defined in the centralized constants module
11+
GPU_A100 = constants.GPU_A100
12+
GPU_A100_SXM4 = constants.GPU_A100_SXM4
13+
GPU_V100 = constants.GPU_V100
14+
GPU_H100 = constants.GPU_H100
15+
GPU_UNKNOWN_TYPE = constants.GPU_UNKNOWN_TYPE
1416

1517
# GPU Resource - MIG Geometries
16-
# A100 Strategies
17-
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
18-
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
19-
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
20-
WHOLE_GPU = "nvidia.com/gpu"
18+
MIG_1G_5GB = constants.MIG_1G_5GB
19+
MIG_2G_10GB = constants.MIG_2G_10GB
20+
MIG_3G_20GB = constants.MIG_3G_20GB
21+
WHOLE_GPU = constants.WHOLE_GPU
2122

2223
# VM GPU Resources
23-
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
24-
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
25-
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
24+
VM_GPU_H100 = constants.VM_GPU_H100
25+
VM_GPU_A100_SXM4 = constants.VM_GPU_A100_SXM4
26+
VM_GPU_V100 = constants.VM_GPU_V100
2627

2728
# SU Types
28-
SU_CPU = "OpenShift CPU"
29-
SU_A100_GPU = "OpenShift GPUA100"
30-
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
31-
SU_V100_GPU = "OpenShift GPUV100"
32-
SU_H100_GPU = "OpenShift GPUH100"
33-
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
34-
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
35-
SU_UNKNOWN = "Openshift Unknown"
29+
SU_CPU = constants.SU_CPU
30+
SU_A100_GPU = constants.SU_A100_GPU
31+
SU_A100_SXM4_GPU = constants.SU_A100_SXM4_GPU
32+
SU_V100_GPU = constants.SU_V100_GPU
33+
SU_H100_GPU = constants.SU_H100_GPU
34+
SU_UNKNOWN_GPU = constants.SU_UNKNOWN_GPU
35+
SU_UNKNOWN_MIG_GPU = constants.SU_UNKNOWN_MIG_GPU
36+
SU_UNKNOWN = constants.SU_UNKNOWN
3637

3738
ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])
3839

0 commit comments

Comments
 (0)