Skip to content

Commit c472167

Browse files
committed
Consolidated config into constants, config with env files.
1 parent 440a773 commit c472167

File tree

10 files changed

+407
-103
lines changed

10 files changed

+407
-103
lines changed

openshift_metrics/.env.example

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# =============================================================================
2+
# OPENSHIFT METRICS CONFIGURATION
3+
# =============================================================================
4+
# This file contains all configuration options for the openshift metrics system.
5+
# Variable names generally match the Python variable names in config.py
# (exception: the S3_OUTPUT_* variables are exposed as S3_* in Python).
6+
#
7+
# TO USE: Copy this file to .env in the openshift_metrics directory
8+
# Command: cp .env.example .env
9+
# Then modify the values as needed for your environment.
10+
11+
# =============================================================================
12+
# INFRASTRUCTURE CONFIGURATION
13+
# =============================================================================
14+
15+
# OpenShift/Prometheus Configuration
16+
OPENSHIFT_PROMETHEUS_URL=https://thanos-querier-openshift-monitoring.apps.shift.nerc.mghpcc.org
17+
OPENSHIFT_TOKEN=your_openshift_token_here
18+
19+
# S3 Configuration
20+
S3_OUTPUT_ENDPOINT_URL=https://s3.us-east-005.backblazeb2.com
21+
S3_OUTPUT_ACCESS_KEY_ID=your_s3_access_key_here
22+
S3_OUTPUT_SECRET_ACCESS_KEY=your_s3_secret_key_here
23+
S3_INVOICE_BUCKET=nerc-invoicing
24+
S3_METRICS_BUCKET=openshift_metrics
25+
26+
# =============================================================================
27+
# PROCESSING CONFIGURATION
28+
# =============================================================================
29+
30+
# Metrics processing intervals
31+
INTERVAL_MINUTES=15
32+
STEP_MINUTES=15
33+
GPU_MAPPING_FILE=gpu_node_map.json
34+
35+
# HTTP retry configuration
36+
HTTP_RETRY_TOTAL=3
37+
HTTP_RETRY_BACKOFF_FACTOR=1
38+
39+
# =============================================================================
40+
# REPORT CONFIGURATION (formerly CLI arguments)
41+
# =============================================================================
42+
43+
# Report dates (leave empty to use defaults - yesterday)
44+
REPORT_START_DATE=
45+
REPORT_END_DATE=
46+
47+
# Upload configuration
48+
UPLOAD_TO_S3=false
49+
50+
# File configuration (leave empty to use default naming patterns)
51+
OUTPUT_FILE=
52+
INVOICE_FILE=
53+
POD_REPORT_FILE=
54+
CLASS_INVOICE_FILE=
55+
56+
# Ignore hours configuration (comma-separated timestamp ranges)
57+
# Format: YYYY-MM-DDTHH:MM:SS,YYYY-MM-DDTHH:MM:SS
58+
IGNORE_HOURS=
59+
60+
# =============================================================================
61+
# RATES AND BILLING CONFIGURATION
62+
# =============================================================================
63+
64+
# Rate source configuration
65+
USE_NERC_RATES=false
66+
67+
# Individual rates (Decimal values)
68+
RATE_CPU_SU=0.013
69+
RATE_GPU_V100_SU=1.214
70+
RATE_GPU_A100SXM4_SU=2.078
71+
RATE_GPU_A100_SU=1.803
72+
RATE_GPU_H100_SU=6.04
73+
74+
# Legacy rate (for backward compatibility)
75+
GPU_A100_RATE=1.803
76+
77+
# =============================================================================
78+
# BUSINESS LOGIC CONFIGURATION
79+
# =============================================================================
80+
81+
# Namespaces that support class-based reporting (comma-separated)
82+
NAMESPACES_WITH_CLASSES=rhods-notebooks
83+
84+
# =============================================================================
85+
# GPU CONFIGURATION
86+
# =============================================================================
87+
88+
# GPU types
89+
GPU_A100=NVIDIA-A100-40GB
90+
GPU_A100_SXM4=NVIDIA-A100-SXM4-40GB
91+
GPU_V100=Tesla-V100-PCIE-32GB
92+
GPU_H100=NVIDIA-H100-80GB-HBM3
93+
94+
# GPU Resource - MIG Geometries
95+
MIG_1G_5GB=nvidia.com/mig-1g.5gb
96+
MIG_2G_10GB=nvidia.com/mig-2g.10gb
97+
MIG_3G_20GB=nvidia.com/mig-3g.20gb
98+
WHOLE_GPU=nvidia.com/gpu
99+
100+
# VM GPU Resources
101+
VM_GPU_H100=nvidia.com/H100_SXM5_80GB
102+
VM_GPU_A100_SXM4=nvidia.com/A100_SXM4_40GB
103+
VM_GPU_V100=nvidia.com/GV100GL_Tesla_V100

openshift_metrics/config.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""
2+
Config for the openshift metrics.
3+
Configurable values are read from environment variables (see .env.example).
4+
Environment variable names generally match the Python variable names below
(exception: the S3_OUTPUT_* variables are exposed as S3_* here).
5+
"""
6+
import os
7+
from decimal import Decimal
8+
from datetime import datetime, timedelta
9+
from typing import Dict, List
10+
11+
# =============================================================================
12+
# HARDCODED CONSTANTS (rarely change, application-specific)
13+
# =============================================================================
14+
15+
# Prometheus query strings
16+
PROMETHEUS_QUERIES = {
17+
"CPU_REQUEST": 'kube_pod_resource_request{resource="cpu", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable',
18+
"MEMORY_REQUEST": 'kube_pod_resource_request{resource="memory", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable',
19+
"GPU_REQUEST": 'kube_pod_resource_request{resource=~"nvidia.com.*", node!=""} unless on(pod, namespace) kube_pod_status_unschedulable',
20+
"KUBE_NODE_LABELS": 'kube_node_labels{label_nvidia_com_gpu_product!=""}',
21+
"KUBE_POD_LABELS": 'kube_pod_labels{label_nerc_mghpcc_org_class!=""}',
22+
}
23+
24+
# Cluster name mappings
25+
CLUSTER_NAME_MAPPING = {
26+
"https://thanos-querier-openshift-monitoring.apps.shift.nerc.mghpcc.org": "ocp-prod",
27+
"https://thanos-querier-openshift-monitoring.apps.ocp-test.nerc.mghpcc.org": "ocp-test",
28+
"https://thanos-querier-openshift-monitoring.apps.edu.nerc.mghpcc.org": "academic",
29+
}
30+
31+
# Default values for empty fields
32+
DEFAULT_VALUES = {
33+
"UNKNOWN_NODE": "Unknown Node",
34+
"UNKNOWN_MODEL": "Unknown Model",
35+
"EMPTY_STRING": "",
36+
}
37+
38+
# =============================================================================
39+
# BUSINESS LOGIC CONSTANTS
40+
# =============================================================================
41+
# Note: Business logic constants (GPU types, SU types, etc.) are now in constants.py
42+
# This file only contains truly configurable values that change between deployments
43+
44+
# =============================================================================
45+
# INFRASTRUCTURE CONFIGURATION
46+
# =============================================================================
47+
48+
# OpenShift/Prometheus
49+
OPENSHIFT_PROMETHEUS_URL = os.getenv("OPENSHIFT_PROMETHEUS_URL")
50+
OPENSHIFT_TOKEN = os.getenv("OPENSHIFT_TOKEN")
51+
52+
# S3 Configuration
53+
S3_ENDPOINT_URL = os.getenv("S3_OUTPUT_ENDPOINT_URL", "https://s3.us-east-005.backblazeb2.com")
54+
S3_ACCESS_KEY_ID = os.getenv("S3_OUTPUT_ACCESS_KEY_ID")
55+
S3_SECRET_ACCESS_KEY = os.getenv("S3_OUTPUT_SECRET_ACCESS_KEY")
56+
S3_INVOICE_BUCKET = os.getenv("S3_INVOICE_BUCKET", "nerc-invoicing")
57+
S3_METRICS_BUCKET = os.getenv("S3_METRICS_BUCKET", "openshift_metrics")
58+
59+
# =============================================================================
60+
# PROCESSING CONFIGURATION
61+
# =============================================================================
62+
63+
# Metrics processing
64+
INTERVAL_MINUTES = int(os.getenv("INTERVAL_MINUTES", "15"))
65+
STEP_MINUTES = int(os.getenv("STEP_MINUTES", "15"))
66+
GPU_MAPPING_FILE = os.getenv("GPU_MAPPING_FILE", "gpu_node_map.json")
67+
68+
# HTTP retry configuration
69+
# Retry settings for HTTP requests, built from environment variables.
# NOTE(review): the keys look like keyword arguments for urllib3's Retry --
# confirm against the caller.
# "backoff_factor" is parsed as a float so that fractional values such as
# "0.5" are accepted; int() would raise ValueError on them at import time.
HTTP_RETRY_CONFIG = {
    "total": int(os.getenv("HTTP_RETRY_TOTAL", "3")),
    "backoff_factor": float(os.getenv("HTTP_RETRY_BACKOFF_FACTOR", "1")),
    "status_forcelist": [429, 500, 502, 503, 504],
}
74+
75+
# =============================================================================
76+
# REPORT CONFIGURATION (formerly CLI arguments)
77+
# =============================================================================
78+
79+
# Report dates (with defaults)
80+
# Default report date: yesterday (local time), formatted as YYYY-MM-DD.
# Computed once so the start and end defaults always agree.
_DEFAULT_REPORT_DATE = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")

# Use `or` instead of a getenv default so that a variable which is set but
# empty (e.g. `REPORT_START_DATE=` in a .env file) also falls back to the
# default rather than yielding an empty string.
REPORT_START_DATE = os.getenv("REPORT_START_DATE") or _DEFAULT_REPORT_DATE
REPORT_END_DATE = os.getenv("REPORT_END_DATE") or _DEFAULT_REPORT_DATE
82+
83+
# Upload configuration
84+
UPLOAD_TO_S3 = os.getenv("UPLOAD_TO_S3", "false").lower() == "true"
85+
86+
# File configuration
87+
OUTPUT_FILE = os.getenv("OUTPUT_FILE")
88+
INVOICE_FILE = os.getenv("INVOICE_FILE")
89+
POD_REPORT_FILE = os.getenv("POD_REPORT_FILE")
90+
CLASS_INVOICE_FILE = os.getenv("CLASS_INVOICE_FILE")
91+
92+
# Ignore hours configuration (comma-separated timestamp ranges)
93+
IGNORE_HOURS = os.getenv("IGNORE_HOURS", "")
94+
95+
# =============================================================================
96+
# RATES AND BILLING CONFIGURATION
97+
# =============================================================================
98+
99+
# Rate source configuration
100+
USE_NERC_RATES = os.getenv("USE_NERC_RATES", "false").lower() == "true"
101+
102+
# Individual rates (read as strings, or None when unset; convert to Decimal before use)
103+
RATE_CPU_SU = os.getenv("RATE_CPU_SU")
104+
RATE_GPU_V100_SU = os.getenv("RATE_GPU_V100_SU")
105+
RATE_GPU_A100SXM4_SU = os.getenv("RATE_GPU_A100SXM4_SU")
106+
RATE_GPU_A100_SU = os.getenv("RATE_GPU_A100_SU")
107+
RATE_GPU_H100_SU = os.getenv("RATE_GPU_H100_SU")
108+
109+
# Legacy rates dictionary (for backward compatibility)
110+
# Note: This would need to import constants if used, but it's marked as legacy
111+
RATES = {
112+
# "NVIDIA-A100-40GB": Decimal(os.getenv("GPU_A100_RATE")) if os.getenv("GPU_A100_RATE") else None,
113+
}
114+
115+
# =============================================================================
116+
# BUSINESS LOGIC CONFIGURATION
117+
# =============================================================================
118+
119+
# Namespaces that support class-based reporting
120+
# Parse the comma-separated namespace list, trimming whitespace around each
# name and dropping empty entries, so values like "a, b" or a trailing comma
# do not produce names with stray spaces or empty strings.
NAMESPACES_WITH_CLASSES = [
    namespace.strip()
    for namespace in os.getenv("NAMESPACES_WITH_CLASSES", "rhods-notebooks").split(",")
    if namespace.strip()
]
121+
122+
# Default filename patterns
123+
DEFAULT_FILENAME_PATTERNS = {
124+
"INVOICE_FILE": "NERC OpenShift {report_month}.csv",
125+
"POD_REPORT_FILE": "Pod NERC OpenShift {report_month}.csv",
126+
"CLASS_INVOICE_FILE": "NERC OpenShift Classes {report_month}.csv",
127+
"OUTPUT_FILE_SINGLE": "metrics-{report_date}.json",
128+
"OUTPUT_FILE_RANGE": "metrics-{start_date}-to-{end_date}.json",
129+
}

openshift_metrics/constants.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""
2+
Business logic constants for the openshift metrics system.
3+
4+
These are fixed constants that define the business logic and don't change between deployments.
5+
For configurable values, see config.py
6+
"""
7+
8+
# =============================================================================
9+
# GPU TYPES
10+
# =============================================================================
11+
12+
GPU_A100 = "NVIDIA-A100-40GB"
13+
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
14+
GPU_V100 = "Tesla-V100-PCIE-32GB"
15+
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
16+
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
17+
18+
# =============================================================================
19+
# GPU RESOURCE - MIG GEOMETRIES
20+
# =============================================================================
21+
22+
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
23+
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
24+
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
25+
WHOLE_GPU = "nvidia.com/gpu"
26+
27+
# =============================================================================
28+
# VM GPU RESOURCES
29+
# =============================================================================
30+
31+
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
32+
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
33+
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
34+
35+
# =============================================================================
36+
# SERVICE UNIT TYPES
37+
# =============================================================================
38+
39+
SU_CPU = "OpenShift CPU"
40+
SU_A100_GPU = "OpenShift GPUA100"
41+
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
42+
SU_V100_GPU = "OpenShift GPUV100"
43+
SU_H100_GPU = "OpenShift GPUH100"
44+
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
45+
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
46+
SU_UNKNOWN = "Openshift Unknown"

openshift_metrics/invoice.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,35 @@
55
from decimal import Decimal, ROUND_HALF_UP
66
import datetime
77

8-
# GPU types
9-
GPU_A100 = "NVIDIA-A100-40GB"
10-
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
11-
GPU_V100 = "Tesla-V100-PCIE-32GB"
12-
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
13-
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
8+
from openshift_metrics import constants
9+
10+
# Import constants from centralized constants module
11+
GPU_A100 = constants.GPU_A100
12+
GPU_A100_SXM4 = constants.GPU_A100_SXM4
13+
GPU_V100 = constants.GPU_V100
14+
GPU_H100 = constants.GPU_H100
15+
GPU_UNKNOWN_TYPE = constants.GPU_UNKNOWN_TYPE
1416

1517
# GPU Resource - MIG Geometries
16-
# A100 Strategies
17-
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
18-
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
19-
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
20-
WHOLE_GPU = "nvidia.com/gpu"
18+
MIG_1G_5GB = constants.MIG_1G_5GB
19+
MIG_2G_10GB = constants.MIG_2G_10GB
20+
MIG_3G_20GB = constants.MIG_3G_20GB
21+
WHOLE_GPU = constants.WHOLE_GPU
2122

2223
# VM GPU Resources
23-
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
24-
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
25-
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
24+
VM_GPU_H100 = constants.VM_GPU_H100
25+
VM_GPU_A100_SXM4 = constants.VM_GPU_A100_SXM4
26+
VM_GPU_V100 = constants.VM_GPU_V100
2627

2728
# SU Types
28-
SU_CPU = "OpenShift CPU"
29-
SU_A100_GPU = "OpenShift GPUA100"
30-
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
31-
SU_V100_GPU = "OpenShift GPUV100"
32-
SU_H100_GPU = "OpenShift GPUH100"
33-
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
34-
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
35-
SU_UNKNOWN = "Openshift Unknown"
29+
SU_CPU = constants.SU_CPU
30+
SU_A100_GPU = constants.SU_A100_GPU
31+
SU_A100_SXM4_GPU = constants.SU_A100_SXM4_GPU
32+
SU_V100_GPU = constants.SU_V100_GPU
33+
SU_H100_GPU = constants.SU_H100_GPU
34+
SU_UNKNOWN_GPU = constants.SU_UNKNOWN_GPU
35+
SU_UNKNOWN_MIG_GPU = constants.SU_UNKNOWN_MIG_GPU
36+
SU_UNKNOWN = constants.SU_UNKNOWN
3637

3738
ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])
3839

0 commit comments

Comments
 (0)