Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions openshift_metrics/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# =============================================================================
# OPENSHIFT METRICS CONFIGURATION
# =============================================================================
# This file contains all configuration options for the openshift metrics system.
# Variable names match the Python names in config.py, except that the S3 endpoint
# and credential variables carry an S3_OUTPUT_ prefix here.
#
# TO USE: Copy this file to .env in the openshift_metrics directory
# Command: cp .env.example .env
# Then modify the values as needed for your environment.

# =============================================================================
# INFRASTRUCTURE CONFIGURATION
# =============================================================================

# OpenShift/Prometheus Configuration
OPENSHIFT_PROMETHEUS_URL=https://thanos-querier-openshift-monitoring.apps.shift.nerc.mghpcc.org
OPENSHIFT_TOKEN=your_openshift_token_here

# S3 Configuration
S3_OUTPUT_ENDPOINT_URL=https://s3.us-east-005.backblazeb2.com
S3_OUTPUT_ACCESS_KEY_ID=your_s3_access_key_here
S3_OUTPUT_SECRET_ACCESS_KEY=your_s3_secret_key_here
S3_INVOICE_BUCKET=nerc-invoicing
S3_METRICS_BUCKET=openshift_metrics

# =============================================================================
# PROCESSING CONFIGURATION
# =============================================================================

# Metrics processing intervals
INTERVAL_MINUTES=15
STEP_MINUTES=15
GPU_MAPPING_FILE=gpu_node_map.json

# HTTP retry configuration
HTTP_RETRY_TOTAL=3
HTTP_RETRY_BACKOFF_FACTOR=1

# =============================================================================
# REPORT CONFIGURATION (formerly CLI arguments)
# =============================================================================

# Report dates (leave empty to use defaults - yesterday)
REPORT_START_DATE=
REPORT_END_DATE=

# Upload configuration
UPLOAD_TO_S3=false

# File configuration (leave empty to use default naming patterns)
OUTPUT_FILE=
INVOICE_FILE=
POD_REPORT_FILE=
CLASS_INVOICE_FILE=

# Ignore hours configuration (comma-separated timestamp ranges)
# Format: YYYY-MM-DDTHH:MM:SS,YYYY-MM-DDTHH:MM:SS
IGNORE_HOURS=

# =============================================================================
# RATES AND BILLING CONFIGURATION
# =============================================================================

# Rate source configuration
USE_NERC_RATES=false

# Individual rates (Decimal values)
RATE_CPU_SU=0.013
RATE_GPU_V100_SU=1.214
RATE_GPU_A100SXM4_SU=2.078
RATE_GPU_A100_SU=1.803
RATE_GPU_H100_SU=6.04

# Legacy rate (kept for backward compatibility; config.py does not currently read it)
GPU_A100_RATE=1.803

# =============================================================================
# BUSINESS LOGIC CONFIGURATION
# =============================================================================

# Namespaces that support class-based reporting (comma-separated)
NAMESPACES_WITH_CLASSES=rhods-notebooks

# =============================================================================
# GPU CONFIGURATION
# =============================================================================

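# NOTE: config.py does not read the variables in this section; the same values
# are defined as fixed constants in constants.py.
# GPU types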
GPU_A100=NVIDIA-A100-40GB
GPU_A100_SXM4=NVIDIA-A100-SXM4-40GB
GPU_V100=Tesla-V100-PCIE-32GB
GPU_H100=NVIDIA-H100-80GB-HBM3

# GPU Resource - MIG Geometries
MIG_1G_5GB=nvidia.com/mig-1g.5gb
MIG_2G_10GB=nvidia.com/mig-2g.10gb
MIG_3G_20GB=nvidia.com/mig-3g.20gb
WHOLE_GPU=nvidia.com/gpu

# VM GPU Resources
VM_GPU_H100=nvidia.com/H100_SXM5_80GB
VM_GPU_A100_SXM4=nvidia.com/A100_SXM4_40GB
VM_GPU_V100=nvidia.com/GV100GL_Tesla_V100
136 changes: 136 additions & 0 deletions openshift_metrics/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""
Config for the openshift metrics.
Configurable values are read from environment variables (see .env.example).
Environment variable names match the Python names below, except the S3 endpoint
and credentials, which are read from S3_OUTPUT_-prefixed variables.
"""

import os
from datetime import datetime, timedelta

# =============================================================================
# HARDCODED CONSTANTS (rarely change, application-specific)
# =============================================================================

# PromQL expressions, keyed by a short name for what each one retrieves.
# The three resource-request queries differ only in their `resource` matcher;
# all of them share the suffix below, which removes series matching
# kube_pod_status_unschedulable on (pod, namespace).
_UNLESS_UNSCHEDULABLE = " unless on(pod, namespace) kube_pod_status_unschedulable"
PROMETHEUS_QUERIES = {
    "CPU_REQUEST": (
        'kube_pod_resource_request{resource="cpu", node!=""}'
        + _UNLESS_UNSCHEDULABLE
    ),
    "MEMORY_REQUEST": (
        'kube_pod_resource_request{resource="memory", node!=""}'
        + _UNLESS_UNSCHEDULABLE
    ),
    "GPU_REQUEST": (
        'kube_pod_resource_request{resource=~"nvidia.com.*", node!=""}'
        + _UNLESS_UNSCHEDULABLE
    ),
    "KUBE_NODE_LABELS": 'kube_node_labels{label_nvidia_com_gpu_product!=""}',
    "KUBE_POD_LABELS": 'kube_pod_labels{label_nerc_mghpcc_org_class!=""}',
}

# Short cluster name for each known Prometheus URL. The URLs differ only in
# the subdomain that follows ".apps.", so they are built from one template.
_PROMETHEUS_URL_TEMPLATE = (
    "https://thanos-querier-openshift-monitoring.apps.{}.nerc.mghpcc.org"
)
CLUSTER_NAME_MAPPING = {
    _PROMETHEUS_URL_TEMPLATE.format(subdomain): cluster_name
    for subdomain, cluster_name in (
        ("shift", "ocp-prod"),
        ("ocp-test", "ocp-test"),
        ("edu", "academic"),
    )
}

# Placeholder strings substituted when a field has no value.
DEFAULT_VALUES = dict(
    UNKNOWN_NODE="Unknown Node",
    UNKNOWN_MODEL="Unknown Model",
    EMPTY_STRING="",
)

# =============================================================================
# BUSINESS LOGIC CONSTANTS
# =============================================================================
# Note: Business logic constants (GPU types, SU types, etc.) are now in constants.py
# This file only contains truly configurable values that change between deployments

# =============================================================================
# INFRASTRUCTURE CONFIGURATION
# =============================================================================

# OpenShift/Prometheus endpoint and token; each is None when its variable is unset.
OPENSHIFT_PROMETHEUS_URL = os.environ.get("OPENSHIFT_PROMETHEUS_URL")
OPENSHIFT_TOKEN = os.environ.get("OPENSHIFT_TOKEN")

# S3 settings. The endpoint and credentials are read from S3_OUTPUT_-prefixed
# variables; the credentials are None when unset, the rest have defaults.
S3_ENDPOINT_URL = os.environ.get(
    "S3_OUTPUT_ENDPOINT_URL",
    "https://s3.us-east-005.backblazeb2.com",
)
S3_ACCESS_KEY_ID = os.environ.get("S3_OUTPUT_ACCESS_KEY_ID")
S3_SECRET_ACCESS_KEY = os.environ.get("S3_OUTPUT_SECRET_ACCESS_KEY")
S3_INVOICE_BUCKET = os.environ.get("S3_INVOICE_BUCKET", "nerc-invoicing")
S3_METRICS_BUCKET = os.environ.get("S3_METRICS_BUCKET", "openshift_metrics")

# =============================================================================
# PROCESSING CONFIGURATION
# =============================================================================

# Metrics processing
# `os.getenv(name) or default` is used so that a variable that is set but
# empty falls back to the default instead of crashing the int() conversion.
INTERVAL_MINUTES = int(os.getenv("INTERVAL_MINUTES") or "15")
STEP_MINUTES = int(os.getenv("STEP_MINUTES") or "15")
GPU_MAPPING_FILE = os.getenv("GPU_MAPPING_FILE") or "gpu_node_map.json"

# HTTP retry configuration
# NOTE(review): the keys look like urllib3 `Retry` keyword arguments — confirm
# against the consumer of this dict.
HTTP_RETRY_CONFIG = {
    "total": int(os.getenv("HTTP_RETRY_TOTAL") or "3"),
    # Parsed as float so fractional factors such as "0.5" are accepted;
    # int("0.5") would raise ValueError at import time.
    "backoff_factor": float(os.getenv("HTTP_RETRY_BACKOFF_FACTOR") or "1"),
    "status_forcelist": [429, 500, 502, 503, 504],
}

# =============================================================================
# REPORT CONFIGURATION (formerly CLI arguments)
# =============================================================================

# Report dates, formatted YYYY-MM-DD. An unset OR empty variable defaults to
# yesterday: .env.example ships these as `NAME=` (empty), and
# os.getenv(name, default) would return "" for that instead of the default.
# "Yesterday" is computed once so both defaults agree even when the module is
# imported around midnight.
_YESTERDAY = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
REPORT_START_DATE = os.getenv("REPORT_START_DATE") or _YESTERDAY
REPORT_END_DATE = os.getenv("REPORT_END_DATE") or _YESTERDAY

# Upload configuration: True only for the (case-insensitive) string "true".
UPLOAD_TO_S3 = os.getenv("UPLOAD_TO_S3", "false").lower() == "true"

# File configuration: None when the variable is unset or empty.
# NOTE(review): presumably the caller then falls back to
# DEFAULT_FILENAME_PATTERNS — confirm against the consumer.
OUTPUT_FILE = os.getenv("OUTPUT_FILE") or None
INVOICE_FILE = os.getenv("INVOICE_FILE") or None
POD_REPORT_FILE = os.getenv("POD_REPORT_FILE") or None
CLASS_INVOICE_FILE = os.getenv("CLASS_INVOICE_FILE") or None

# Ignore hours configuration (comma-separated timestamp ranges), kept as the
# raw string; empty when unset.
IGNORE_HOURS = os.getenv("IGNORE_HOURS", "")

# =============================================================================
# RATES AND BILLING CONFIGURATION
# =============================================================================

# Rate source toggle: True only when the variable is the (case-insensitive)
# string "true".
USE_NERC_RATES = "true" == os.environ.get("USE_NERC_RATES", "false").lower()

# Individual rates, kept exactly as read from the environment: a string, or
# None when the variable is unset.
# NOTE(review): presumably converted to Decimal by the consumer — confirm.
RATE_CPU_SU = os.environ.get("RATE_CPU_SU")
RATE_GPU_V100_SU = os.environ.get("RATE_GPU_V100_SU")
RATE_GPU_A100SXM4_SU = os.environ.get("RATE_GPU_A100SXM4_SU")
RATE_GPU_A100_SU = os.environ.get("RATE_GPU_A100_SU")
RATE_GPU_H100_SU = os.environ.get("RATE_GPU_H100_SU")

# Legacy rates dictionary (for backward compatibility); intentionally empty.
# Its former "NVIDIA-A100-40GB" entry, driven by GPU_A100_RATE, is disabled
# and would need the constants module and Decimal if it were restored.
RATES = {}

# =============================================================================
# BUSINESS LOGIC CONFIGURATION
# =============================================================================

# Namespaces that support class-based reporting, read from a comma-separated
# environment variable. Whitespace around each name is stripped and empty
# entries are dropped, so "a, b," yields ["a", "b"] rather than
# ["a", " b", ""], and an empty variable yields [] rather than [""].
NAMESPACES_WITH_CLASSES = [
    namespace.strip()
    for namespace in os.getenv("NAMESPACES_WITH_CLASSES", "rhods-notebooks").split(",")
    if namespace.strip()
]

# Default file-name templates. The {report_month}, {report_date},
# {start_date} and {end_date} placeholders are left for the caller to fill in.
DEFAULT_FILENAME_PATTERNS = dict(
    INVOICE_FILE="NERC OpenShift {report_month}.csv",
    POD_REPORT_FILE="Pod NERC OpenShift {report_month}.csv",
    CLASS_INVOICE_FILE="NERC OpenShift Classes {report_month}.csv",
    OUTPUT_FILE_SINGLE="metrics-{report_date}.json",
    OUTPUT_FILE_RANGE="metrics-{start_date}-to-{end_date}.json",
)
46 changes: 46 additions & 0 deletions openshift_metrics/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
Business logic constants for the openshift metrics system.

These are fixed constants that define the business logic and don't change between deployments.
For configurable values, see config.py
"""

# =============================================================================
# GPU TYPES
# =============================================================================
# Identifier strings for the GPU models, plus a sentinel for anything else.

GPU_A100: str = "NVIDIA-A100-40GB"
GPU_A100_SXM4: str = "NVIDIA-A100-SXM4-40GB"
GPU_V100: str = "Tesla-V100-PCIE-32GB"
GPU_H100: str = "NVIDIA-H100-80GB-HBM3"
GPU_UNKNOWN_TYPE: str = "GPU_UNKNOWN_TYPE"

# =============================================================================
# GPU RESOURCE - MIG GEOMETRIES
# =============================================================================
# nvidia.com resource names: three MIG slice geometries and the whole GPU.

MIG_1G_5GB: str = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB: str = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB: str = "nvidia.com/mig-3g.20gb"
WHOLE_GPU: str = "nvidia.com/gpu"

# =============================================================================
# VM GPU RESOURCES
# =============================================================================
# nvidia.com resource names used for VM GPUs.

VM_GPU_H100: str = "nvidia.com/H100_SXM5_80GB"
VM_GPU_A100_SXM4: str = "nvidia.com/A100_SXM4_40GB"
VM_GPU_V100: str = "nvidia.com/GV100GL_Tesla_V100"

# =============================================================================
# SERVICE UNIT TYPES
# =============================================================================
# Service unit (SU) type names.

SU_CPU: str = "OpenShift CPU"
SU_A100_GPU: str = "OpenShift GPUA100"
SU_A100_SXM4_GPU: str = "OpenShift GPUA100SXM4"
SU_V100_GPU: str = "OpenShift GPUV100"
SU_H100_GPU: str = "OpenShift GPUH100"
SU_UNKNOWN_GPU: str = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU: str = "OpenShift Unknown MIG GPU"
# NOTE(review): lower-case "s" in "Openshift" differs from the other SU names;
# kept as-is because this string is a runtime value.
SU_UNKNOWN: str = "Openshift Unknown"
45 changes: 23 additions & 22 deletions openshift_metrics/invoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,35 @@
from decimal import Decimal, ROUND_HALF_UP
import datetime

# GPU types
GPU_A100 = "NVIDIA-A100-40GB"
GPU_A100_SXM4 = "NVIDIA-A100-SXM4-40GB"
GPU_V100 = "Tesla-V100-PCIE-32GB"
GPU_H100 = "NVIDIA-H100-80GB-HBM3"
GPU_UNKNOWN_TYPE = "GPU_UNKNOWN_TYPE"
from openshift_metrics import constants

# Import constants from centralized constants module
GPU_A100 = constants.GPU_A100
GPU_A100_SXM4 = constants.GPU_A100_SXM4
GPU_V100 = constants.GPU_V100
GPU_H100 = constants.GPU_H100
GPU_UNKNOWN_TYPE = constants.GPU_UNKNOWN_TYPE

# GPU Resource - MIG Geometries
# A100 Strategies
MIG_1G_5GB = "nvidia.com/mig-1g.5gb"
MIG_2G_10GB = "nvidia.com/mig-2g.10gb"
MIG_3G_20GB = "nvidia.com/mig-3g.20gb"
WHOLE_GPU = "nvidia.com/gpu"
MIG_1G_5GB = constants.MIG_1G_5GB
MIG_2G_10GB = constants.MIG_2G_10GB
MIG_3G_20GB = constants.MIG_3G_20GB
WHOLE_GPU = constants.WHOLE_GPU

# VM GPU Resources
VM_GPU_H100 = "nvidia.com/H100_SXM5_80GB"
VM_GPU_A100_SXM4 = "nvidia.com/A100_SXM4_40GB"
VM_GPU_V100 = "nvidia.com/GV100GL_Tesla_V100"
VM_GPU_H100 = constants.VM_GPU_H100
VM_GPU_A100_SXM4 = constants.VM_GPU_A100_SXM4
VM_GPU_V100 = constants.VM_GPU_V100

# SU Types
SU_CPU = "OpenShift CPU"
SU_A100_GPU = "OpenShift GPUA100"
SU_A100_SXM4_GPU = "OpenShift GPUA100SXM4"
SU_V100_GPU = "OpenShift GPUV100"
SU_H100_GPU = "OpenShift GPUH100"
SU_UNKNOWN_GPU = "OpenShift Unknown GPU"
SU_UNKNOWN_MIG_GPU = "OpenShift Unknown MIG GPU"
SU_UNKNOWN = "Openshift Unknown"
SU_CPU = constants.SU_CPU
SU_A100_GPU = constants.SU_A100_GPU
SU_A100_SXM4_GPU = constants.SU_A100_SXM4_GPU
SU_V100_GPU = constants.SU_V100_GPU
SU_H100_GPU = constants.SU_H100_GPU
SU_UNKNOWN_GPU = constants.SU_UNKNOWN_GPU
SU_UNKNOWN_MIG_GPU = constants.SU_UNKNOWN_MIG_GPU
SU_UNKNOWN = constants.SU_UNKNOWN

ServiceUnit = namedtuple("ServiceUnit", ["su_type", "su_count", "determinig_resource"])

Expand Down
Loading