diff --git a/.github/helm-e2e/action.yaml b/.github/helm-e2e/action.yaml new file mode 100644 index 00000000..982e1291 --- /dev/null +++ b/.github/helm-e2e/action.yaml @@ -0,0 +1,18 @@ +--- +name: Helm E2E Test +description: Tests Helm chart installation and operator deployment +inputs: + version: + description: Operator version to install + required: true +runs: + using: composite + steps: + - name: Run Helm E2E tests + shell: bash + run: | + ./tests/helm.sh \ + --running-on-vm \ + --version=${{ inputs.version }} + env: + VERSION: ${{ inputs.version }} diff --git a/.github/workflows/pr-checks.yaml b/.github/workflows/pr-checks.yaml index ee4c3f11..9cb5ea38 100644 --- a/.github/workflows/pr-checks.yaml +++ b/.github/workflows/pr-checks.yaml @@ -386,3 +386,62 @@ jobs: with: name: cluster-state path: cluster-state + + helm-validate: + runs-on: ubuntu-latest + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: false + + - name: Install all tools + uses: ./.github/tools-cache + + - name: Run Helm validation + run: hack/helm/validate.sh + + helm-e2e: + needs: [bundle, helm-validate] + env: + KIND_VERSION: 0.27.0 + KIND_WORKER_NODES: 2 + name: helm-e2e + runs-on: ubuntu-latest-16-cores + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Install Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: false + + - name: Install all tools + uses: ./.github/tools-cache + + - name: Setup cluster with prerequisites + run: make cluster-up + env: + PROMETHEUS_ENABLE: "true" + + - name: Compute version + uses: ./.github/compute-version + id: version + + - name: Run Helm E2E tests + uses: ./.github/helm-e2e + with: + version: ${{ steps.version.outputs.version }} + + - name: Archive cluster state + if: always() + uses: actions/upload-artifact@v4 + with: + name: helm-cluster-state + path: cluster-state diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5cb6dafc..d8ee2027 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: rev: v1.37.0 hooks: - id: yamllint - exclude: ^(bundle|config|hack/crd) + exclude: ^(bundle|config|hack/crd|manifests/helm) - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.44.0 @@ -61,7 +61,7 @@ repos: hooks: - id: commitlint stages: [commit-msg] - additional_dependencies: ['@commitlint/config-conventional'] # yamllint disable-line rule:quoted-strings + additional_dependencies: ["@commitlint/config-conventional"] # yamllint disable-line rule:quoted-strings - repo: https://github.com/fsfe/reuse-tool rev: v5.0.2 diff --git a/Makefile b/Makefile index a950e18b..ed3ce084 100644 --- a/Makefile +++ b/Makefile @@ -295,6 +295,51 @@ undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/confi $(KUSTOMIZE) build config/default/k8s | \ kubectl delete --ignore-not-found=$(ignore-not-found) -f - +##@ Helm Deployment + +HELM_CHART_DIR := manifests/helm/kepler-operator +HELM_RELEASE_NAME ?= kepler-operator +HELM_NAMESPACE ?= kepler-operator +HELM_TIMEOUT ?= 2m + +.PHONY: helm-template +helm-template: helm manifests ## Generate manifests from Helm chart + $(HELM) template $(HELM_RELEASE_NAME) $(HELM_CHART_DIR) \ + --namespace $(HELM_NAMESPACE) \ + --set operator.image=$(OPERATOR_IMG) \ + --set kepler.image=$(KEPLER_IMG) \ + --set kube-rbac-proxy.image=$(KUBE_RBAC_PROXY_IMG) + +.PHONY: helm-install +helm-install: helm manifests helm-sync-crds ## Install operator via Helm + 
$(HELM) upgrade --install $(HELM_RELEASE_NAME) $(HELM_CHART_DIR) \ + --namespace $(HELM_NAMESPACE) \ + --create-namespace \ + --set operator.image=$(OPERATOR_IMG) \ + --set kepler.image=$(KEPLER_IMG) \ + --set kube-rbac-proxy.image=$(KUBE_RBAC_PROXY_IMG) \ + --timeout $(HELM_TIMEOUT) \ + --wait + +.PHONY: helm-uninstall +helm-uninstall: helm ## Uninstall operator via Helm + $(HELM) uninstall $(HELM_RELEASE_NAME) --namespace $(HELM_NAMESPACE) + +.PHONY: helm-package +helm-package: helm manifests helm-sync-crds ## Package the Helm chart + $(HELM) package $(HELM_CHART_DIR) --destination tmp/ + +.PHONY: helm-sync-crds +helm-sync-crds: ## Sync CRDs from config/crd/bases to Helm chart + @mkdir -p $(HELM_CHART_DIR)/crds + cp config/crd/bases/*.yaml $(HELM_CHART_DIR)/crds/ + @echo "โœ… CRDs synced to Helm chart" + +.PHONY: helm-validate +helm-validate: kustomize helm yq ## Validate Helm chart (syntax, templates, CRD sync, resources) + @echo "Validating Helm chart against kustomize..." + ./hack/helm/validate.sh + ##@ Build Dependencies ## Location where binaries are installed @@ -304,11 +349,13 @@ LOCALBIN ?= $(shell pwd)/tmp/bin KUSTOMIZE ?= $(LOCALBIN)/kustomize CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen CRDOC ?= $(LOCALBIN)/crdoc +HELM ?= $(LOCALBIN)/helm # NOTE: please keep this list sorted so that it can be easily searched TOOLS = controller-gen \ crdoc \ govulncheck \ + helm \ jq \ kubectl \ kustomize \ diff --git a/README.md b/README.md index b63f51d7..603e7bee 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Deploy the operator and its dependencies: ```sh make tools kubectl create -f https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.76.0/bundle.yaml -kubectl create -f https://github.com/jetstack/cert-manager/releases/download/v1.15.3/cert-manager.yaml +kubectl create -f https://github.com/cert-manager/cert-manager/releases/download/v1.18.2/cert-manager.yaml make deploy kubectl apply -k config/samples/ ``` diff --git a/docs/developer/README.md b/docs/developer/README.md index 600f6c78..b3f32192 100644 --- a/docs/developer/README.md +++ b/docs/developer/README.md @@ -62,3 +62,8 @@ * Kube Builder Book: * Operator SDK Getting Started: * Kubernetes Programming Book: + +# Developer Guides + +* [Helm Chart Maintenance](helm-chart-maintenance.md) - How to update and maintain the Helm chart +* [Pre-commit Hooks](pre-commit-hooks.md) - Setting up and using pre-commit hooks diff --git a/docs/developer/helm-chart-maintenance.md b/docs/developer/helm-chart-maintenance.md new file mode 100644 index 00000000..f54e2447 --- /dev/null +++ b/docs/developer/helm-chart-maintenance.md @@ -0,0 +1,432 @@ +# โŽˆ Helm Chart Maintenance Guide + +This guide explains how to maintain and update the Helm chart for the Kepler Operator. + +--- + +## ๐Ÿ“‹ Overview + +The Helm chart uses a **hybrid automation approach**: + +- **Manual**: Templates are hand-crafted for full control and customization +- **Automated**: CRDs are automatically synced from `config/crd/bases/` +- **Validated**: Automated checks ensure consistency with kustomize deployment + +This approach balances maintainability with flexibility. 
+ +--- + +## ๐Ÿ—‚๏ธ Chart Structure + +```text +manifests/helm/kepler-operator/ +โ”œโ”€โ”€ Chart.yaml # Chart metadata (version, appVersion) +โ”œโ”€โ”€ values.yaml # Default configuration values +โ”œโ”€โ”€ README.md # User-facing installation guide +โ”œโ”€โ”€ .helmignore # Files excluded from packaging +โ”œโ”€โ”€ crds/ # CRDs (auto-synced from config/crd/bases/) +โ”‚ โ”œโ”€โ”€ kepler.system...powermonitors.yaml +โ”‚ โ””โ”€โ”€ kepler.system...powermonitorinternals.yaml +โ””โ”€โ”€ templates/ + โ”œโ”€โ”€ _helpers.tpl # Template helper functions + โ”œโ”€โ”€ NOTES.txt # Post-install instructions + โ”œโ”€โ”€ serviceaccount.yaml + โ”œโ”€โ”€ rbac.yaml # All RBAC resources + โ”œโ”€โ”€ deployment.yaml + โ”œโ”€โ”€ services.yaml # Metrics + webhook services + โ”œโ”€โ”€ certificate.yaml # cert-manager resources (conditional) + โ”œโ”€โ”€ webhooks.yaml # Webhook configurations (conditional) + โ””โ”€โ”€ servicemonitor.yaml # Prometheus ServiceMonitor (conditional) +``` + +--- + +## ๐Ÿ”„ When to Update the Helm Chart + +| Change Type | Action Required | Files to Update | +|-------------|----------------|-----------------| +| **CRD Modified** | Run `make helm-sync-crds` | Auto-synced to `crds/` | +| **RBAC Changed** | Manual template update | `templates/rbac.yaml` | +| **Deployment Changed** | Manual template update | `templates/deployment.yaml` | +| **New Resource Added** | Create new template | `templates/.yaml` | +| **Config Option Added** | Update values & templates | `values.yaml` + relevant template | +| **Version Bump** | Update chart metadata | `Chart.yaml` (version, appVersion) | + +--- + +## ๐Ÿ› ๏ธ Update Workflow + +### 1. Make Changes + +```bash +# If CRDs changed, sync them +make helm-sync-crds + +# If templates changed, edit manually +vim manifests/helm/kepler-operator/templates/.yaml + +# If configuration changed, update values +vim manifests/helm/kepler-operator/values.yaml +``` + +### 2. Validate Changes + +```bash +# Run all validation tests (recommended) +make helm-validate # Full validation (syntax, templates, CRD sync, resources) + +# Or preview rendered manifests: +make helm-template # Preview rendered manifests +``` + +### 3. Test Locally (Optional) + +```bash +# Full end-to-end test (recommended) +./tests/helm.sh + +# Or manual testing: +make helm-install # Install to cluster +kubectl get all -n kepler-operator # Verify deployment +make helm-uninstall # Clean up + +# Advanced: test with existing image +./tests/helm.sh --no-build --version=0.21.0 +``` + +--- + +## โœ๏ธ Creating/Updating Templates + +### Use Kustomize as Reference + +**Important**: Always use `config/default/k8s` as your source of truth, NOT `config/manifests`. + +```bash +# Generate reference manifest +make manifests +kustomize build config/default/k8s > /tmp/kustomize-ref.yaml + +# Extract specific resources +./tmp/bin/yq 'select(.kind == "Deployment")' /tmp/kustomize-ref.yaml +./tmp/bin/yq 'select(.kind == "Service")' /tmp/kustomize-ref.yaml +``` + +**Why `config/default/k8s`?** + +- `config/default/k8s`: Standard Kubernetes deployment (matches Helm use case) +- `config/manifests`: OLM-specific with ClusterServiceVersion (different model) + +### Template Creation Steps + +1. Extract resource from kustomize output +2. Replace hardcoded values with template helpers: + - Names: `{{ include "kepler-operator.fullname" . }}-` + - Namespace: `{{ include "kepler-operator.namespace" . }}` + - Labels: `{{ include "kepler-operator.labels" . | nindent 4 }}` + - Images: `{{ include "kepler-operator.image" . }}` +3. 
Add conditional rendering if needed: + + ```yaml + {{- if .Values.feature.enabled }} + # resource definition + {{- end }} + ``` + +4. Use values from `values.yaml`: + + ```yaml + replicas: {{ .Values.replicaCount }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + ``` + +### Helper Function Reference + +Common helpers available in `templates/_helpers.tpl`: + +```yaml +# Chart name +{{ include "kepler-operator.name" . }} + +# Full name (release-name + chart-name) +{{ include "kepler-operator.fullname" . }} + +# Namespace +{{ include "kepler-operator.namespace" . }} + +# Standard labels +{{ include "kepler-operator.labels" . | nindent 4 }} + +# Selector labels (stable, for pod selectors) +{{ include "kepler-operator.managerLabels" . | nindent 6 }} + +# Image references +{{ include "kepler-operator.image" . }} # Operator image +{{ include "kepler-operator.keplerImage" . }} # Kepler image +{{ include "kepler-operator.kubeRbacProxyImage" . }} # Kube RBAC Proxy image + +# Service account name +{{ include "kepler-operator.serviceAccountName" . }} +``` + +--- + +## ๐Ÿงช Validation Details + +The `make helm-validate` command runs three layers of checks: + +### Layer 1: Syntax Validation + +```bash +helm lint manifests/helm/kepler-operator +``` + +- Validates Chart.yaml structure +- Checks template syntax +- Verifies values.yaml schema + +### Layer 2: Template Rendering + +```bash +helm template kepler-operator manifests/helm/kepler-operator \ + --set metrics.serviceMonitor.enabled=true +``` + +- Ensures templates render without errors +- Tests value substitution +- Validates conditional logic + +### Layer 3: Consistency Checks + +```bash +./hack/helm/validate.sh +``` + +- Verifies CRD sync status (CRDs match `config/crd/bases/`) +- Validates all expected resources present +- Checks project-local tools available + +--- + +## ๐Ÿ’ก Common Patterns + +### Conditional Resources + +Use feature flags in `values.yaml`: + +```yaml +# values.yaml +webhooks: + enabled: true + certManager: + enabled: true +``` + +Then wrap entire templates: + +```yaml +# templates/certificate.yaml +{{- if .Values.webhooks.certManager.enabled }} +# Certificate and Issuer resources +{{- end }} +``` + +### Multi-Resource Templates + +Group related resources in single file with `---` separator: + +```yaml +# templates/rbac.yaml +# Role +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +... +--- +# RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +... +``` + +### Image Configuration + +Use full image paths for simplicity: + +```yaml +# values.yaml +operator: + image: quay.io/sustainable_computing_io/kepler-operator:0.21.0 + pullPolicy: IfNotPresent + +kepler: + image: quay.io/sustainable_computing_io/kepler:v0.11.0 + +kube-rbac-proxy: + image: quay.io/brancz/kube-rbac-proxy:v0.19.0 + +# _helpers.tpl +{{- define "kepler-operator.image" -}} +{{- .Values.operator.image }} +{{- end }} + +{{- define "kepler-operator.keplerImage" -}} +{{- .Values.kepler.image }} +{{- end }} + +{{- define "kepler-operator.kubeRbacProxyImage" -}} +{{- index .Values "kube-rbac-proxy" "image" }} +{{- end }} +``` + +This approach is simpler and allows overriding with: + +```bash +helm install kepler-operator ./chart \ + --set operator.image=localhost:5001/kepler-operator:dev +``` + +--- + +## โš ๏ธ Common Pitfalls + +### โŒ Wrong Kustomize Overlay + +```bash +kustomize build config/manifests # OLM-specific, wrong! +``` + +โœ… Use: + +```bash +kustomize build config/default/k8s # Vanilla K8s, correct! 
+``` + +### โŒ Hardcoded Names + +```yaml +name: kepler-operator-controller +namespace: kepler-operator +``` + +โœ… Use helpers: + +```yaml +name: {{ include "kepler-operator.fullname" . }}-controller +namespace: {{ include "kepler-operator.namespace" . }} +``` + +### โŒ Validation Without Optional Resources + +```bash +helm template kepler-operator manifests/helm/kepler-operator +# ServiceMonitor missing! +``` + +โœ… Enable all optionals: + +```bash +helm template kepler-operator manifests/helm/kepler-operator \ + --set metrics.serviceMonitor.enabled=true +``` + +### โŒ Mutable Selector Labels + +```yaml +selector: + matchLabels: + {{- include "kepler-operator.labels" . | nindent 4 }} + # Includes version, breaks on upgrade! +``` + +โœ… Use stable selectors: + +```yaml +selector: + matchLabels: + {{- include "kepler-operator.managerLabels" . | nindent 4 }} +``` + +### โŒ Namespace Template + --create-namespace Flag + +```yaml +# templates/namespace.yaml +apiVersion: v1 +kind: Namespace +metadata: + name: {{ include "kepler-operator.namespace" . }} +``` + +AND using `--create-namespace` flag causes conflict: + +```text +Error: namespaces "kepler-operator" already exists +``` + +โœ… Use **only** `--create-namespace` flag (standard Helm practice): + +```bash +helm install kepler-operator ./chart \ + --namespace kepler-operator \ + --create-namespace # Let Helm create namespace +``` + +**Rationale**: The `--create-namespace` flag is simpler and follows standard Helm conventions. Template-based namespace creation adds unnecessary complexity and potential conflicts. + +--- + +## ๐Ÿ“ฆ Release Process + +When releasing a new version: + +1. **Update Chart.yaml**: + + ```yaml + version: 0.22.0 # Bump chart version + appVersion: 0.22.0 # Match operator version + ``` + +2. **Sync CRDs** (if changed): + + ```bash + make helm-sync-crds + ``` + +3. **Validate**: + + ```bash + make helm-validate # Runs syntax, template, CRD sync, and resource validation + ``` + +4. **Package** (optional): + + ```bash + make helm-package + ``` + +5. **Commit changes**: + + ```bash + git add manifests/helm/kepler-operator/ + git commit -m "chore(helm): bump chart version to 0.22.0" + ``` + +--- + +## ๐Ÿ“š Additional Resources + +- **Helm Best Practices**: +- **Knowledge Base**: `tmp/agents/knowledge/helm-deployment.md` +- **Chart README**: `manifests/helm/kepler-operator/README.md` (user guide) +- **Kustomize Docs**: + +--- + +## ๐Ÿค Getting Help + +- Review existing templates for patterns +- Check validation errors: `make helm-validate` provides specific guidance +- See knowledge base for detailed explanations: `tmp/agents/knowledge/helm-deployment.md` +- Ask in project discussions or issues + +Happy charting! 
โ›ต diff --git a/hack/cluster.sh b/hack/cluster.sh index 05ce4191..adb2fc79 100755 --- a/hack/cluster.sh +++ b/hack/cluster.sh @@ -7,7 +7,7 @@ declare -r VERSION=${VERSION:-v0.0.3} declare -r CLUSTER_PROVIDER=${CLUSTER_PROVIDER:-kind} declare -r GRAFANA_ENABLE=${GRAFANA_ENABLE:-true} declare -r KIND_WORKER_NODES=${KIND_WORKER_NODES:-2} -declare -r CERTMANAGER_VERSION=${CERT_MANAGER_VERSION:-1.15.0} +declare -r CERTMANAGER_VERSION=${CERT_MANAGER_VERSION:-1.18.2} declare -r OLM_VERSION=${OLM_VERSION:-v0.28.0} # constants @@ -16,7 +16,7 @@ declare -r PROJECT_ROOT declare -r TMP_DIR="$PROJECT_ROOT/tmp" declare -r DEV_CLUSTER_DIR="$TMP_DIR/local-dev-cluster" declare -r BIN_DIR="$TMP_DIR/bin" -declare -r CERTMANAGER_URL="https://github.com/jetstack/cert-manager/releases/download/v$CERTMANAGER_VERSION/cert-manager.yaml" +declare -r CERTMANAGER_URL="https://github.com/cert-manager/cert-manager/releases/download/v$CERTMANAGER_VERSION/cert-manager.yaml" source "$PROJECT_ROOT/hack/utils.bash" diff --git a/hack/helm/validate.sh b/hack/helm/validate.sh new file mode 100755 index 00000000..2e3b27b0 --- /dev/null +++ b/hack/helm/validate.sh @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# Copyright 2025 The Kepler Contributors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +HELM_CHART_DIR="$PROJECT_ROOT/manifests/helm/kepler-operator" +CRD_SOURCE_DIR="$PROJECT_ROOT/config/crd/bases" +CRD_DEST_DIR="$HELM_CHART_DIR/crds" + +# Image versions for validation +OPERATOR_IMAGE="${OPERATOR_IMAGE:-quay.io/sustainable_computing_io/kepler-operator:0.21.0}" +KEPLER_IMAGE="${KEPLER_IMAGE:-quay.io/sustainable_computing_io/kepler:latest}" +KUBE_RBAC_PROXY_IMAGE="${KUBE_RBAC_PROXY_IMAGE:-quay.io/brancz/kube-rbac-proxy:v0.18.1}" + +# shellcheck source=hack/utils.bash +source "$SCRIPT_DIR/../utils.bash" + +# Validate that required tools are available +check_tools() { + local bin_dir="$PROJECT_ROOT/tmp/bin" + local tools=("helm" "kustomize" "yq") + for tool in "${tools[@]}"; do + if [[ ! -x "$bin_dir/$tool" ]]; then + fail "$tool is not installed. Please run 'make $tool' to install it." + return 1 + fi + done +} + +# Use project-local tools +export PATH="$PROJECT_ROOT/tmp/bin:$PATH" + +# Render Helm templates with standard test values +render_helm_template() { + helm template kepler-operator "$HELM_CHART_DIR" \ + --namespace kepler-operator \ + --set operator.image="$OPERATOR_IMAGE" \ + --set kepler.image="$KEPLER_IMAGE" \ + --set kube-rbac-proxy.image="$KUBE_RBAC_PROXY_IMAGE" \ + --set metrics.serviceMonitor.enabled=true +} + +# Validate Helm chart syntax +validate_helm_syntax() { + info "Validating Helm chart syntax..." 
+ + helm lint "$HELM_CHART_DIR" >/dev/null 2>&1 || { + fail "Helm chart syntax validation failed" + helm lint "$HELM_CHART_DIR" + return 1 + + } + ok "Helm chart syntax is valid" + return 0 +} + +# Validate that templates render successfully +validate_helm_template() { + info "Validating Helm templates render successfully..." + + render_helm_template >/dev/null 2>&1 || { + fail "Helm template rendering failed" + render_helm_template + return 1 + } + ok "Helm templates render successfully" + return 0 +} + +# Validate CRD sync status +validate_crd_sync() { + info "Validating CRD sync status..." + local all_synced=true + + for crd_file in "$CRD_SOURCE_DIR"/*.yaml; do + local crd_name + crd_name=$(basename "$crd_file") + local dest_file="$CRD_DEST_DIR/$crd_name" + + [[ -f "$dest_file" ]] || { + fail "CRD $crd_name not found in Helm chart crds/ directory" + all_synced=false + continue + } + + diff -q "$crd_file" "$dest_file" >/dev/null 2>&1 || { + fail "CRD $crd_name is out of sync. Run 'make helm-sync-crds' to sync." + all_synced=false + continue + } + done + + [[ "$all_synced" == "true" ]] || return 1 + ok "All CRDs are synced" + return 0 +} + +# Validate that all expected resources are present +validate_resources() { + info "Validating expected resources are present..." + local expected_resources=( + "ServiceAccount" + "Role" + "ClusterRole" + "RoleBinding" + "ClusterRoleBinding" + "Service" + "Deployment" + "Certificate" + "Issuer" + "MutatingWebhookConfiguration" + "ValidatingWebhookConfiguration" + "ServiceMonitor" + ) + + local rendered + rendered=$(render_helm_template) + + local all_found=true + for resource in "${expected_resources[@]}"; do + echo "$rendered" | grep -q "^kind: $resource$" || { + fail "Expected resource $resource not found in rendered templates" + all_found=false + } + done + + [[ "$all_found" == "true" ]] || return 1 + ok "All expected resources are present" + return 0 +} + +main() { + info "Starting Helm chart validation..." 
+ + check_tools + validate_helm_syntax + validate_helm_template + validate_crd_sync + validate_resources + + ok "Helm chart validation completed successfully" +} + +main "$@" diff --git a/hack/tools.sh b/hack/tools.sh index 743eeb4b..62bb8d22 100755 --- a/hack/tools.sh +++ b/hack/tools.sh @@ -20,6 +20,7 @@ declare -r OC_VERSION=${OC_VERSION:-4.18.1} declare -r KUBECTL_VERSION=${KUBECTL_VERSION:-v1.28.4} declare -r SHFMT_VERSION=${SHFMT_VERSION:-v3.7.0} declare -r JQ_VERSION=${JQ_VERSION:-1.7} +declare -r HELM_VERSION=${HELM_VERSION:-v3.18.1} # install declare -r KUSTOMIZE_INSTALL_SCRIPT="https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" @@ -27,6 +28,7 @@ declare -r OPERATOR_SDK_INSTALL="https://github.com/operator-framework/operator- declare -r YQ_INSTALL="https://github.com/mikefarah/yq/releases/download/$YQ_VERSION/yq_${GOOS}_${GOARCH}" declare -r OC_URL="https://mirror.openshift.com/pub/openshift-v4/clients/ocp/$OC_VERSION" declare -r JQ_INSTALL_URL="https://github.com/jqlang/jq/releases/download/jq-$JQ_VERSION" +declare -r HELM_INSTALL_URL="https://get.helm.sh" source "$PROJECT_ROOT/hack/utils.bash" @@ -225,6 +227,33 @@ install_jq() { ok "jq was installed successfully" } +version_helm() { + helm version +} + +install_helm() { + local version_regex="Version:\"$HELM_VERSION\"" + validate_version helm version "$version_regex" && return 0 + + info "installing helm version: $HELM_VERSION" + local helm_tar="helm-${HELM_VERSION}-${GOOS}-${GOARCH}.tar.gz" + local install_url="$HELM_INSTALL_URL/$helm_tar" + + local helm_tmp="$LOCAL_BIN/tmp-helm" + mkdir -p "$helm_tmp" + + curl -sSL "$install_url" | tar -xzf - -C "$helm_tmp" || { + fail "failed to install helm" + return 1 + } + + mv "$helm_tmp/$GOOS-$GOARCH/helm" "$LOCAL_BIN/" + chmod +x "$LOCAL_BIN/helm" + rm -rf "$helm_tmp" + + ok "helm was installed successfully" +} + install_all() { info "installing all tools ..." local ret=0 diff --git a/manifests/helm/kepler-operator/.helmignore b/manifests/helm/kepler-operator/.helmignore new file mode 100644 index 00000000..43eb8e1d --- /dev/null +++ b/manifests/helm/kepler-operator/.helmignore @@ -0,0 +1,27 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +# Testing and CI files +.travis.yml +.gitlab-ci.yml +OWNERS diff --git a/manifests/helm/kepler-operator/Chart.yaml b/manifests/helm/kepler-operator/Chart.yaml new file mode 100644 index 00000000..f5f69913 --- /dev/null +++ b/manifests/helm/kepler-operator/Chart.yaml @@ -0,0 +1,26 @@ +apiVersion: v2 +name: kepler-operator +description: A Helm chart for deploying the Kepler Operator on Kubernetes +type: application +version: 0.21.0 +appVersion: 0.21.0 +keywords: + - kepler + - power + - energy + - monitoring + - sustainability +home: https://sustainable-computing.io/ +sources: + - https://github.com/sustainable-computing-io/kepler-operator +maintainers: + - name: Sunil Thaha + email: sthaha@redhat.com + - name: Vibhu Prashar + email: vprashar@redhat.com + - name: Vimal Kumar + email: vimalkum@redhat.com + - name: Kaiyi Liu + email: kaliu@redhat.com +icon: https://raw.githubusercontent.com/sustainable-computing-io/kepler-operator/v1alpha1/docs/logo/kepler-icon.svg +kubeVersion: ">=1.24.0" diff --git a/manifests/helm/kepler-operator/README.md b/manifests/helm/kepler-operator/README.md new file mode 100644 index 00000000..2e82c94d --- /dev/null +++ b/manifests/helm/kepler-operator/README.md @@ -0,0 +1,161 @@ +# Kepler Operator Helm Chart + +Helm chart for deploying the Kepler Operator on Kubernetes. + +> **Note**: This guide provides both `make` targets (for developers working from source) and direct `helm` commands (for users installing from packaged charts). + +## Prerequisites + +- Kubernetes >=1.24.0 +- Helm >=3.0.0 +- cert-manager >=1.18.0 (for webhook certificates) + +## Installation + +### Install cert-manager (if not already installed) + +```bash +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.18.2/cert-manager.yaml +``` + +### Install Kepler Operator + +**From source repository:** + +```bash +make helm-install +``` + +**Using Helm directly:** + +```bash +helm install kepler-operator ./manifests/helm/kepler-operator \ + --namespace kepler-operator \ + --create-namespace +``` + +**From packaged chart:** + +```bash +helm install kepler-operator kepler-operator-0.21.0.tgz \ + --namespace kepler-operator \ + --create-namespace +``` + +### Install with custom values + +```bash +helm install kepler-operator ./manifests/helm/kepler-operator \ + --namespace kepler-operator \ + --create-namespace \ + --set operator.image=quay.io/sustainable_computing_io/kepler-operator:v0.21.0 \ + --set kepler.image=quay.io/sustainable_computing_io/kepler:v0.11.0 \ + --set metrics.serviceMonitor.enabled=true +``` + +Or create a custom `values.yaml` and install: + +```bash +helm install kepler-operator ./manifests/helm/kepler-operator \ + --namespace kepler-operator \ + --create-namespace \ + --values custom-values.yaml +``` + +## Configuration + +Key configuration values: + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `operator.image` | Operator image (full path with tag) | `quay.io/sustainable_computing_io/kepler-operator:0.21.0` | +| `operator.pullPolicy` | Image pull policy | `IfNotPresent` | +| `kepler.image` | Kepler image (full path with tag) | `quay.io/sustainable_computing_io/kepler:v0.11.0` | +| `kube-rbac-proxy.image` | Kube RBAC Proxy image (full path with tag) | `quay.io/brancz/kube-rbac-proxy:v0.19.0` | +| `replicaCount` | Number 
of operator replicas | `1` | +| `namespace` | Operator namespace | `kepler-operator` | +| `webhooks.enabled` | Enable admission webhooks | `true` | +| `webhooks.certManager.enabled` | Use cert-manager for webhook certificates | `true` | +| `metrics.serviceMonitor.enabled` | Enable Prometheus ServiceMonitor | `false` | + +See [values.yaml](values.yaml) for complete list of configuration options. + +## Creating a PowerMonitor Resource + +After installing the operator, create a PowerMonitor resource: + +```yaml +apiVersion: kepler.system.sustainable.computing.io/v1alpha1 +kind: PowerMonitor +metadata: + name: power-monitor +spec: + kepler: + deployment: + nodeSelector: + kubernetes.io/os: linux + config: + logLevel: info +``` + +## Upgrading + +**From source repository:** + +```bash +make helm-install # Uses helm upgrade --install +``` + +**Using Helm directly:** + +```bash +helm upgrade kepler-operator ./manifests/helm/kepler-operator \ + --namespace kepler-operator +``` + +## Uninstalling + +**From source repository:** + +```bash +make helm-uninstall +``` + +**Using Helm directly:** + +```bash +helm uninstall kepler-operator --namespace kepler-operator +``` + +## Development + +For contributors working on the Helm chart, see the [Helm Chart Maintenance Guide](../../../docs/developer/helm-chart-maintenance.md). + +### Testing + +**Static validation:** + +```bash +make helm-validate # Run all validation tests (syntax, templates, CRD sync, resources) +make helm-template # Preview rendered manifests +``` + +**End-to-end testing:** + +```bash +# Full e2e test (requires cluster with cert-manager) +./tests/helm.sh + +# See all options +./tests/helm.sh --help +``` + +### Syncing CRDs + +```bash +make helm-sync-crds +``` + +## License + +Apache License 2.0 diff --git a/manifests/helm/kepler-operator/crds/kepler.system.sustainable.computing.io_powermonitorinternals.yaml b/manifests/helm/kepler-operator/crds/kepler.system.sustainable.computing.io_powermonitorinternals.yaml new file mode 100644 index 00000000..785e0b51 --- /dev/null +++ b/manifests/helm/kepler-operator/crds/kepler.system.sustainable.computing.io_powermonitorinternals.yaml @@ -0,0 +1,383 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: powermonitorinternals.kepler.system.sustainable.computing.io +spec: + group: kepler.system.sustainable.computing.io + names: + kind: PowerMonitorInternal + listKind: PowerMonitorInternalList + plural: powermonitorinternals + singular: powermonitorinternal + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.kepler.desiredNumberScheduled + name: Desired + type: integer + - jsonPath: .status.kepler.currentNumberScheduled + name: Current + type: integer + - jsonPath: .status.kepler.updatedNumberScheduled + name: Up-to-date + type: integer + - jsonPath: .status.kepler.numberReady + name: Ready + type: integer + - jsonPath: .status.kepler.numberAvailable + name: Available + type: integer + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .spec.kepler.deployment.image + name: Image + type: string + - jsonPath: .spec.kepler.deployment.nodeSelector + name: Node-Selector + priority: 10 + type: string + - jsonPath: .spec.kepler.deployment.tolerations + name: Tolerations + priority: 10 + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: PowerMonitorInternal is the Schema for the internal kepler 2 + API + properties: + 
apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: PowerMonitorInternalSpec defines the desired state of PowerMonitorInternalSpec + properties: + kepler: + properties: + config: + properties: + additionalConfigMaps: + description: |- + AdditionalConfigMaps is a list of ConfigMap names that will be merged with the default ConfigMap + These AdditionalConfigMaps must exist in the same namespace as PowerMonitor components + items: + description: ConfigMapRef defines a reference to a ConfigMap + properties: + name: + description: Name of the ConfigMap + minLength: 1 + type: string + required: + - name + type: object + type: array + x-kubernetes-list-type: atomic + logLevel: + default: info + type: string + maxTerminated: + default: 500 + description: |- + MaxTerminated controls terminated workload tracking behavior + Negative values: track unlimited terminated workloads (no capacity limit) + Zero: disable terminated workload tracking completely + Positive values: track top N terminated workloads by energy consumption + format: int32 + type: integer + metricLevels: + default: + - node + - pod + - vm + description: |- + MetricLevels specifies which metrics levels to export + Valid values are combinations of: node, process, container, vm, pod + items: + enum: + - node + - process + - container + - vm + - pod + type: string + type: array + x-kubernetes-list-type: set + sampleRate: + default: 5s + description: |- + SampleRate specifies the interval for monitoring resources (processes, containers, vms, etc.) + Must be a positive duration (e.g., "5s", "1m", "30s"). Negative values are not allowed. + pattern: ^[0-9]+(\.[0-9]+)?(ns|us|ms|s|m|h)$ + type: string + staleness: + default: 500ms + description: |- + Staleness specifies how long to wait before considering calculated power values as stale + Must be a positive duration (e.g., "500ms", "5s", "1h"). Negative values are not allowed. + pattern: ^[0-9]+(\.[0-9]+)?(ns|us|ms|s|m|h)$ + type: string + type: object + deployment: + properties: + image: + minLength: 3 + type: string + kubeRbacProxyImage: + minLength: 3 + type: string + namespace: + minLength: 1 + type: string + nodeSelector: + additionalProperties: + type: string + default: + kubernetes.io/os: linux + description: Defines which Nodes the Pod is scheduled on + type: object + secrets: + description: Secrets to be mounted in the power monitor containers + items: + description: |- + SecretRef defines a reference to a Secret to be mounted + + Mount Path Cautions: + Exercise caution when setting mount paths for secrets. 
Avoid mounting secrets to critical system paths + that may interfere with Kepler's operation or container security: + - /etc/kepler - Reserved for Kepler configuration files + - /sys, /proc, /dev - System directories that should remain read-only + - /usr, /bin, /sbin, /lib - System binaries and libraries + - / - Root filesystem + + Best practices: + - Use subdirectories like /etc/kepler/secrets/ or /opt/secrets/ + - Ensure mount paths don't conflict with existing volume mounts + - Test mount paths in development environments before production deployment + - Monitor Kepler pod logs for mount-related errors + properties: + mountPath: + description: MountPath where the secret should be mounted + in the container + minLength: 1 + type: string + name: + description: Name of the secret in the same namespace + as the Kepler deployment + minLength: 1 + type: string + readOnly: + default: true + description: ReadOnly specifies whether the secret should + be mounted read-only + type: boolean + required: + - mountPath + - name + type: object + type: array + x-kubernetes-list-type: atomic + security: + description: If set, defines the security mode and allowed + SANames + properties: + allowedSANames: + items: + type: string + type: array + x-kubernetes-list-type: atomic + mode: + enum: + - none + - rbac + type: string + type: object + tolerations: + default: + - effect: "" + key: "" + operator: Exists + value: "" + description: If specified, define Pod's tolerations + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. 
+ type: string + type: object + type: array + required: + - image + - namespace + type: object + required: + - deployment + type: object + openshift: + properties: + dashboard: + properties: + enabled: + default: false + type: boolean + type: object + enabled: + default: true + type: boolean + required: + - enabled + type: object + required: + - kepler + type: object + status: + properties: + conditions: + description: conditions represent the latest available observations + of power-monitor-internal + items: + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. + type: string + status: + description: status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of Kepler Condition - Reconciled, Available + ... + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-type: atomic + kepler: + properties: + currentNumberScheduled: + description: |- + The number of nodes that are running at least 1 power-monitor-internal pod and are + supposed to run the power-monitor-internal pod. + format: int32 + type: integer + desiredNumberScheduled: + description: |- + The total number of nodes that should be running the power-monitor-internal + pod (including nodes correctly running the power-monitor-internal pod). + format: int32 + type: integer + numberAvailable: + description: |- + The number of nodes that should be running the power-monitor-internal pod and have one or + more of the power-monitor-internal pod running and available + format: int32 + type: integer + numberMisscheduled: + description: |- + The number of nodes that are running the power-monitor-internal pod, but are not supposed + to run the power-monitor-internal pod. + format: int32 + type: integer + numberReady: + description: |- + numberReady is the number of nodes that should be running the power-monitor-internal pod + and have one or more of the power-monitor-internal pod running with a Ready Condition. 
+ format: int32 + type: integer + numberUnavailable: + description: |- + The number of nodes that should be running the + power-monitor-internal pod and have none of the power-monitor-internal pod running and available + format: int32 + type: integer + updatedNumberScheduled: + description: The total number of nodes that are running updated + power-monitor-internal pod + format: int32 + type: integer + required: + - currentNumberScheduled + - desiredNumberScheduled + - numberMisscheduled + - numberReady + type: object + required: + - conditions + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/manifests/helm/kepler-operator/crds/kepler.system.sustainable.computing.io_powermonitors.yaml b/manifests/helm/kepler-operator/crds/kepler.system.sustainable.computing.io_powermonitors.yaml new file mode 100644 index 00000000..daae1e1e --- /dev/null +++ b/manifests/helm/kepler-operator/crds/kepler.system.sustainable.computing.io_powermonitors.yaml @@ -0,0 +1,352 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.17.2 + name: powermonitors.kepler.system.sustainable.computing.io +spec: + group: kepler.system.sustainable.computing.io + names: + kind: PowerMonitor + listKind: PowerMonitorList + plural: powermonitors + singular: powermonitor + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.kepler.desiredNumberScheduled + name: Desired + type: integer + - jsonPath: .status.kepler.currentNumberScheduled + name: Current + type: integer + - jsonPath: .status.kepler.numberReady + name: Ready + type: integer + - jsonPath: .status.kepler.updatedNumberScheduled + name: Up-to-date + type: integer + - jsonPath: .status.kepler.numberAvailable + name: Available + type: integer + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .spec.kepler.deployment.nodeSelector + name: Node-Selector + priority: 10 + type: string + - jsonPath: .spec.kepler.deployment.tolerations + name: Tolerations + priority: 10 + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: PowerMonitor is the Schema for the PowerMonitor API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: PowerMonitorSpec defines the desired state of Power Monitor + properties: + kepler: + properties: + config: + properties: + additionalConfigMaps: + description: |- + AdditionalConfigMaps is a list of ConfigMap names that will be merged with the default ConfigMap + These AdditionalConfigMaps must exist in the same namespace as PowerMonitor components + items: + description: ConfigMapRef defines a reference to a ConfigMap + properties: + name: + description: Name of the ConfigMap + minLength: 1 + type: string + required: + - name + type: object + type: array + x-kubernetes-list-type: atomic + logLevel: + default: info + type: string + maxTerminated: + default: 500 + description: |- + MaxTerminated controls terminated workload tracking behavior + Negative values: track unlimited terminated workloads (no capacity limit) + Zero: disable terminated workload tracking completely + Positive values: track top N terminated workloads by energy consumption + format: int32 + type: integer + metricLevels: + default: + - node + - pod + - vm + description: |- + MetricLevels specifies which metrics levels to export + Valid values are combinations of: node, process, container, vm, pod + items: + enum: + - node + - process + - container + - vm + - pod + type: string + type: array + x-kubernetes-list-type: set + sampleRate: + default: 5s + description: |- + SampleRate specifies the interval for monitoring resources (processes, containers, vms, etc.) + Must be a positive duration (e.g., "5s", "1m", "30s"). Negative values are not allowed. + pattern: ^[0-9]+(\.[0-9]+)?(ns|us|ms|s|m|h)$ + type: string + staleness: + default: 500ms + description: |- + Staleness specifies how long to wait before considering calculated power values as stale + Must be a positive duration (e.g., "500ms", "5s", "1h"). Negative values are not allowed. + pattern: ^[0-9]+(\.[0-9]+)?(ns|us|ms|s|m|h)$ + type: string + type: object + deployment: + properties: + nodeSelector: + additionalProperties: + type: string + default: + kubernetes.io/os: linux + description: Defines which Nodes the Pod is scheduled on + type: object + secrets: + description: Secrets to be mounted in the power monitor containers + items: + description: |- + SecretRef defines a reference to a Secret to be mounted + + Mount Path Cautions: + Exercise caution when setting mount paths for secrets. 
Avoid mounting secrets to critical system paths + that may interfere with Kepler's operation or container security: + - /etc/kepler - Reserved for Kepler configuration files + - /sys, /proc, /dev - System directories that should remain read-only + - /usr, /bin, /sbin, /lib - System binaries and libraries + - / - Root filesystem + + Best practices: + - Use subdirectories like /etc/kepler/secrets/ or /opt/secrets/ + - Ensure mount paths don't conflict with existing volume mounts + - Test mount paths in development environments before production deployment + - Monitor Kepler pod logs for mount-related errors + properties: + mountPath: + description: MountPath where the secret should be mounted + in the container + minLength: 1 + type: string + name: + description: Name of the secret in the same namespace + as the Kepler deployment + minLength: 1 + type: string + readOnly: + default: true + description: ReadOnly specifies whether the secret should + be mounted read-only + type: boolean + required: + - mountPath + - name + type: object + type: array + x-kubernetes-list-type: atomic + security: + description: If set, defines the security mode and allowed + SANames + properties: + allowedSANames: + items: + type: string + type: array + x-kubernetes-list-type: atomic + mode: + enum: + - none + - rbac + type: string + type: object + tolerations: + default: + - effect: "" + key: "" + operator: Exists + value: "" + description: If specified, define Pod's tolerations + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + type: object + type: object + required: + - kepler + type: object + status: + description: PowerMonitorStatus defines the observed state of Power Monitor + properties: + conditions: + description: conditions represent the latest available observations + of power-monitor + items: + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
+ format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. + type: string + status: + description: status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of Kepler Condition - Reconciled, Available + ... + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-type: atomic + kepler: + properties: + currentNumberScheduled: + description: |- + The number of nodes that are running at least 1 power-monitor pod and are + supposed to run the power-monitor pod. + format: int32 + type: integer + desiredNumberScheduled: + description: |- + The total number of nodes that should be running the power-monitor + pod (including nodes correctly running the power-monitor pod). + format: int32 + type: integer + numberAvailable: + description: |- + The number of nodes that should be running the power-monitor pod and have one or + more of the power-monitor pod running and available + format: int32 + type: integer + numberMisscheduled: + description: |- + The number of nodes that are running the power-monitor pod, but are not supposed + to run the power-monitor pod. + format: int32 + type: integer + numberReady: + description: |- + numberReady is the number of nodes that should be running the power-monitor pod + and have one or more of the power-monitor pod running with a Ready Condition. + format: int32 + type: integer + numberUnavailable: + description: |- + The number of nodes that should be running the + power-monitor pod and have none of the power-monitor pod running and available + format: int32 + type: integer + updatedNumberScheduled: + description: The total number of nodes that are running updated + power-monitor pod + format: int32 + type: integer + required: + - currentNumberScheduled + - desiredNumberScheduled + - numberMisscheduled + - numberReady + type: object + required: + - conditions + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/manifests/helm/kepler-operator/templates/NOTES.txt b/manifests/helm/kepler-operator/templates/NOTES.txt new file mode 100644 index 00000000..ac72449c --- /dev/null +++ b/manifests/helm/kepler-operator/templates/NOTES.txt @@ -0,0 +1,40 @@ +Thank you for installing {{ .Chart.Name }}! + +Your release is named {{ .Release.Name }}. + +The Kepler Operator has been deployed in namespace: {{ include "kepler-operator.namespace" . }} + +To check the operator status: + + kubectl get pods -n {{ include "kepler-operator.namespace" . }} -l app.kubernetes.io/name={{ include "kepler-operator.name" . }} + +Next Steps: + +1. Verify the operator is running: + + kubectl get deployment -n {{ include "kepler-operator.namespace" . }} + +2. Check the CRDs are installed: + + kubectl get crds | grep powermonitor + +3. 
Create a PowerMonitor custom resource to deploy Kepler: + + kubectl apply -f https://raw.githubusercontent.com/sustainable-computing-io/kepler-operator/v1alpha1/config/samples/kepler.system_v1alpha1_powermonitor.yaml + +4. Verify Kepler DaemonSet is created: + + kubectl get daemonset -n {{ include "kepler-operator.deploymentNamespace" . }} + +{{- if .Values.webhooks.enabled }} +{{- if .Values.webhooks.certManager.enabled }} + +Note: This installation requires cert-manager to be installed for webhook certificates. +If you haven't installed cert-manager yet, install it with: + + kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.18.2/cert-manager.yaml +{{- end }} +{{- end }} + +For more information on using the Kepler Operator, visit: +https://github.com/sustainable-computing-io/kepler-operator diff --git a/manifests/helm/kepler-operator/templates/_helpers.tpl b/manifests/helm/kepler-operator/templates/_helpers.tpl new file mode 100644 index 00000000..f21a169d --- /dev/null +++ b/manifests/helm/kepler-operator/templates/_helpers.tpl @@ -0,0 +1,107 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "kepler-operator.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "kepler-operator.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "kepler-operator.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "kepler-operator.labels" -}} +helm.sh/chart: {{ include "kepler-operator.chart" . }} +{{ include "kepler-operator.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/part-of: kepler-operator +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "kepler-operator.selectorLabels" -}} +app.kubernetes.io/name: {{ include "kepler-operator.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Manager labels (for deployment and pod) +*/}} +{{- define "kepler-operator.managerLabels" -}} +{{ include "kepler-operator.selectorLabels" . 
}} +app.kubernetes.io/component: manager +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "kepler-operator.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default "kepler-operator-controller-manager" .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + +{{/* +Create the namespace to use +*/}} +{{- define "kepler-operator.namespace" -}} +{{- default "kepler-operator-system" .Values.namespace }} +{{- end }} + +{{/* +Operator image +*/}} +{{- define "kepler-operator.image" -}} +{{- .Values.operator.image }} +{{- end }} + +{{/* +Kepler image (managed by operator) +*/}} +{{- define "kepler-operator.keplerImage" -}} +{{- .Values.kepler.image }} +{{- end }} + +{{/* +Kube RBAC Proxy image (managed by operator) +*/}} +{{- define "kepler-operator.kubeRbacProxyImage" -}} +{{- index .Values "kube-rbac-proxy" "image" }} +{{- end }} + +{{/* +Deployment namespace for power monitoring components +Defaults to "power-monitor" (the operator's code default) if not specified +*/}} +{{- define "kepler-operator.deploymentNamespace" -}} +{{- default "power-monitor" .Values.operator.deploymentNamespace }} +{{- end }} diff --git a/manifests/helm/kepler-operator/templates/certificate.yaml b/manifests/helm/kepler-operator/templates/certificate.yaml new file mode 100644 index 00000000..8b1e86c2 --- /dev/null +++ b/manifests/helm/kepler-operator/templates/certificate.yaml @@ -0,0 +1,31 @@ +{{- if .Values.webhooks.certManager.enabled }} +# Self-signed Issuer +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "kepler-operator.fullname" . }}-selfsigned-issuer + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: certificate +spec: + selfSigned: {} +--- +# Webhook TLS Certificate +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "kepler-operator.fullname" . }}-serving-cert + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: certificate +spec: + dnsNames: + - {{ include "kepler-operator.fullname" . }}-webhook-service.{{ include "kepler-operator.namespace" . }}.svc + - {{ include "kepler-operator.fullname" . }}-webhook-service.{{ include "kepler-operator.namespace" . }}.svc.cluster.local + issuerRef: + kind: Issuer + name: {{ include "kepler-operator.fullname" . }}-selfsigned-issuer + secretName: webhook-server-cert +{{- end }} diff --git a/manifests/helm/kepler-operator/templates/deployment.yaml b/manifests/helm/kepler-operator/templates/deployment.yaml new file mode 100644 index 00000000..b748c3d8 --- /dev/null +++ b/manifests/helm/kepler-operator/templates/deployment.yaml @@ -0,0 +1,93 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "kepler-operator.fullname" . }}-controller + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: manager +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "kepler-operator.managerLabels" . | nindent 6 }} + app.kubernetes.io/part-of: kepler-operator + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + labels: + {{- include "kepler-operator.managerLabels" . 
| nindent 8 }} + app.kubernetes.io/part-of: kepler-operator + spec: + serviceAccountName: {{ include "kepler-operator.serviceAccountName" . }} + securityContext: + runAsNonRoot: true + terminationGracePeriodSeconds: 10 + containers: + - name: manager + image: {{ include "kepler-operator.image" . }} + imagePullPolicy: {{ .Values.operator.pullPolicy }} + command: + - /manager + args: + {{- if .Values.operator.deploymentNamespace }} + - --deployment-namespace={{ .Values.operator.deploymentNamespace }} + {{- end }} + - --leader-elect + - --kepler.image=$(RELATED_IMAGE_KEPLER) + - --kube-rbac-proxy.image=$(RELATED_IMAGE_KUBE_RBAC_PROXY) + - --zap-log-level=5 + env: + - name: RELATED_IMAGE_KEPLER + value: {{ include "kepler-operator.keplerImage" . }} + - name: RELATED_IMAGE_KUBE_RBAC_PROXY + value: {{ include "kepler-operator.kubeRbacProxyImage" . }} + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + - containerPort: 8080 + name: metrics + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 20 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 20 + periodSeconds: 20 + resources: + {{- toYaml .Values.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + {{- if .Values.webhooks.enabled }} + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: cert + readOnly: true + {{- end }} + {{- if .Values.webhooks.enabled }} + volumes: + - name: cert + secret: + defaultMode: 420 + secretName: webhook-server-cert + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/manifests/helm/kepler-operator/templates/rbac.yaml b/manifests/helm/kepler-operator/templates/rbac.yaml new file mode 100644 index 00000000..5b21aab9 --- /dev/null +++ b/manifests/helm/kepler-operator/templates/rbac.yaml @@ -0,0 +1,237 @@ +# Leader Election Role +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "kepler-operator.fullname" . }}-leader-election + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: rbac +rules: + - apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete + - apiGroups: + - "" + resources: + - events + verbs: + - create + - patch +--- +# Manager ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "kepler-operator.fullname" . }}-manager + labels: + {{- include "kepler-operator.labels" . 
| nindent 4 }} + app.kubernetes.io/component: rbac +rules: + - apiGroups: + - "" + resources: + - configmaps + - namespaces + - persistentvolumeclaims + - serviceaccounts + - services + verbs: + - create + - delete + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - nodes/metrics + - nodes/proxy + - nodes/stats + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - secrets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - "" + resources: + - serviceaccounts/token + verbs: + - create + - apiGroups: + - apps + resources: + - daemonsets + - deployments + verbs: + - create + - delete + - list + - patch + - update + - watch + - apiGroups: + - kepler.system.sustainable.computing.io + - rbac.authorization.k8s.io + resources: + - '*' + verbs: + - '*' + - apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules + - servicemonitors + verbs: + - create + - delete + - list + - patch + - update + - watch + - apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - create + - delete + - list + - patch + - update + - use + - watch + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +# Metrics Auth ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "kepler-operator.fullname" . }}-metrics-auth + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: rbac +rules: + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +# Metrics Reader ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "kepler-operator.fullname" . }}-metrics-reader + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: rbac +rules: + - nonResourceURLs: + - /metrics + verbs: + - get +--- +# Leader Election RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "kepler-operator.fullname" . }}-leader-election + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: rbac +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "kepler-operator.fullname" . }}-leader-election +subjects: + - kind: ServiceAccount + name: {{ include "kepler-operator.serviceAccountName" . }} + namespace: {{ include "kepler-operator.namespace" . }} +--- +# Manager ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "kepler-operator.fullname" . }}-manager + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: rbac +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "kepler-operator.fullname" . }}-manager +subjects: + - kind: ServiceAccount + name: {{ include "kepler-operator.serviceAccountName" . }} + namespace: {{ include "kepler-operator.namespace" . }} +--- +# Metrics Auth ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "kepler-operator.fullname" . 
}}-metrics-auth + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: rbac +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "kepler-operator.fullname" . }}-metrics-auth +subjects: + - kind: ServiceAccount + name: {{ include "kepler-operator.serviceAccountName" . }} + namespace: {{ include "kepler-operator.namespace" . }} diff --git a/manifests/helm/kepler-operator/templates/serviceaccount.yaml b/manifests/helm/kepler-operator/templates/serviceaccount.yaml new file mode 100644 index 00000000..5d7a2664 --- /dev/null +++ b/manifests/helm/kepler-operator/templates/serviceaccount.yaml @@ -0,0 +1,14 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "kepler-operator.serviceAccountName" . }} + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: rbac + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/manifests/helm/kepler-operator/templates/servicemonitor.yaml b/manifests/helm/kepler-operator/templates/servicemonitor.yaml new file mode 100644 index 00000000..5d9b2c7d --- /dev/null +++ b/manifests/helm/kepler-operator/templates/servicemonitor.yaml @@ -0,0 +1,16 @@ +{{- if .Values.metrics.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "kepler-operator.fullname" . }}-metrics-monitor + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: metrics +spec: + endpoints: + - port: metrics + selector: + matchLabels: + control-plane: controller-manager +{{- end }} diff --git a/manifests/helm/kepler-operator/templates/services.yaml b/manifests/helm/kepler-operator/templates/services.yaml new file mode 100644 index 00000000..2a17d075 --- /dev/null +++ b/manifests/helm/kepler-operator/templates/services.yaml @@ -0,0 +1,36 @@ +# Metrics Service +apiVersion: v1 +kind: Service +metadata: + name: {{ include "kepler-operator.fullname" . }}-metrics-service + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + control-plane: controller-manager +spec: + ports: + - name: metrics + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + {{- include "kepler-operator.managerLabels" . | nindent 4 }} + app.kubernetes.io/part-of: kepler-operator +--- +# Webhook Service +apiVersion: v1 +kind: Service +metadata: + name: {{ include "kepler-operator.fullname" . }}-webhook-service + namespace: {{ include "kepler-operator.namespace" . }} + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: webhook +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + {{- include "kepler-operator.managerLabels" . 
| nindent 4 }} + app.kubernetes.io/part-of: kepler-operator diff --git a/manifests/helm/kepler-operator/templates/webhooks.yaml b/manifests/helm/kepler-operator/templates/webhooks.yaml new file mode 100644 index 00000000..36bacf71 --- /dev/null +++ b/manifests/helm/kepler-operator/templates/webhooks.yaml @@ -0,0 +1,70 @@ +{{- if .Values.webhooks.enabled }} +# Mutating Webhook Configuration +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: {{ include "kepler-operator.fullname" . }}-mutating-webhook-configuration + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: webhook + {{- if .Values.webhooks.certManager.enabled }} + annotations: + cert-manager.io/inject-ca-from: {{ include "kepler-operator.namespace" . }}/{{ include "kepler-operator.fullname" . }}-serving-cert + {{- end }} +webhooks: + - admissionReviewVersions: + - v1 + clientConfig: + service: + name: {{ include "kepler-operator.fullname" . }}-webhook-service + namespace: {{ include "kepler-operator.namespace" . }} + path: /mutate-kepler-system-sustainable-computing-io-v1alpha1-powermonitor + failurePolicy: Fail + name: mpowermonitor.kb.io + rules: + - apiGroups: + - kepler.system.sustainable.computing.io + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - powermonitors + sideEffects: None +--- +# Validating Webhook Configuration +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: {{ include "kepler-operator.fullname" . }}-validating-webhook-configuration + labels: + {{- include "kepler-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: webhook + {{- if .Values.webhooks.certManager.enabled }} + annotations: + cert-manager.io/inject-ca-from: {{ include "kepler-operator.namespace" . }}/{{ include "kepler-operator.fullname" . }}-serving-cert + {{- end }} +webhooks: + - admissionReviewVersions: + - v1 + clientConfig: + service: + name: {{ include "kepler-operator.fullname" . }}-webhook-service + namespace: {{ include "kepler-operator.namespace" . }} + path: /validate-kepler-system-sustainable-computing-io-v1alpha1-powermonitor + failurePolicy: Fail + name: vpowermonitor.kb.io + rules: + - apiGroups: + - kepler.system.sustainable.computing.io + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + - DELETE + resources: + - powermonitors + sideEffects: None +{{- end }} diff --git a/manifests/helm/kepler-operator/values.yaml b/manifests/helm/kepler-operator/values.yaml new file mode 100644 index 00000000..7789b200 --- /dev/null +++ b/manifests/helm/kepler-operator/values.yaml @@ -0,0 +1,62 @@ +# Default values for kepler-operator. 
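+#
+# These defaults can be overridden at install time with --set flags or a custom
+# values file. A minimal sketch (illustrative only -- the release name and the
+# overridden keys below are examples, not requirements):
+#
+#   helm install kepler-operator manifests/helm/kepler-operator \
+#     --namespace kepler-operator --create-namespace \
+#     --set operator.image=quay.io/sustainable_computing_io/kepler-operator:0.21.0 \
+#     --set metrics.serviceMonitor.enabled=true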
+ +# Operator +operator: + image: quay.io/sustainable_computing_io/kepler-operator:0.21.0 + pullPolicy: IfNotPresent + # Namespace where power monitoring components will be deployed + # Defaults to "power-monitor" if not specified + deploymentNamespace: "" + +# Managed Images (images that the operator will deploy) +kepler: + image: quay.io/sustainable_computing_io/kepler:v0.11.0 + +kube-rbac-proxy: + image: quay.io/brancz/kube-rbac-proxy:v0.19.0 + +# Deployment +replicaCount: 1 +namespace: kepler-operator +nameOverride: "" +fullnameOverride: "" + +# RBAC +serviceAccount: + create: true + name: kepler-operator-controller-manager + annotations: {} + +# Webhooks & cert-manager +webhooks: + enabled: true + certManager: + enabled: true # Requires cert-manager to be pre-installed + +# Monitoring +metrics: + enabled: true + serviceMonitor: + enabled: false # Set true if Prometheus Operator is available + +# Resources +resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + +# Tolerations, nodeSelector, affinity +tolerations: [] +nodeSelector: {} +affinity: {} + +# Security Context +securityContext: + runAsNonRoot: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL diff --git a/tests/helm.sh b/tests/helm.sh new file mode 100755 index 00000000..9810a516 --- /dev/null +++ b/tests/helm.sh @@ -0,0 +1,380 @@ +#!/usr/bin/env bash +# Helm E2E testing script +# Tests the Kepler Operator Helm chart deployment end-to-end + +set -e -u -o pipefail + +PROJECT_ROOT="$(git rev-parse --show-toplevel)" +declare -r PROJECT_ROOT + +# Source test utilities +source "$PROJECT_ROOT/tests/utils.sh" + +# Script configuration +declare -r HELM_RELEASE_NAME="${HELM_RELEASE_NAME:-kepler-operator}" +declare -r HELM_NAMESPACE="${HELM_NAMESPACE:-kepler-operator}" +declare -r POWERMONITOR_NS="${POWERMONITOR_NS:-power-monitor}" +declare -r LOGS_DIR="${LOGS_DIR:-tmp/helm-e2e}" + +# Testdata paths +declare -r TESTDATA_DIR="$PROJECT_ROOT/tests/testdata" +declare -r POWERMONITOR_VM_YAML="$TESTDATA_DIR/powermonitor-vm.yaml" +declare -r POWERMONITOR_BAREMETAL_YAML="$TESTDATA_DIR/powermonitor-baremetal.yaml" +declare -r FAKE_CPU_CONFIGMAP_YAML="$TESTDATA_DIR/fake-cpu-configmap.yaml" + +# Image configuration +# NOTE: these are not readonly because it can be overridden by --flag +declare VERSION="${VERSION:-0.0.0-dev}" +declare IMG_BASE="${IMG_BASE:-localhost:5001}" +declare OPERATOR_IMG="$IMG_BASE/kepler-operator:$VERSION" + +# Managed image versions (what operator deploys) +declare -r KEPLER_IMAGE="${KEPLER_IMAGE:-quay.io/sustainable_computing_io/kepler:latest}" +declare -r KUBE_RBAC_PROXY_IMAGE="${KUBE_RBAC_PROXY_IMAGE:-quay.io/brancz/kube-rbac-proxy:v0.19.0}" + +# Script flags +declare NO_BUILD=false +declare NO_DEPLOY=false +declare CLEANUP=false +declare RUNNING_ON_VM=false +declare SHOW_USAGE=false + +# Trap cleanup on exit +trap cleanup_on_exit INT TERM + +cleanup_on_exit() { + cleanup_jobs + if $CLEANUP; then + uninstall_helm || true + fi +} + +# Build operator image +build_operator() { + header "Build Operator Image" + + $NO_BUILD && { + info "Skipping operator image build (--no-build)" + return 0 + } + + run make operator-build \ + VERSION="$VERSION" \ + IMG_BASE="$IMG_BASE" + + ok "Operator image built: $OPERATOR_IMG" +} + +# Load operator image to kind cluster +load_operator_image() { + header "Load Operator Image to Kind" + + $NO_BUILD && { + info "Skipping image load (--no-build)" + return 0 + } + + kind_load_image "$OPERATOR_IMG" + + ok "Operator image loaded to kind" +} 
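+
+# NOTE: for local debugging it can help to confirm that the image actually
+# landed on the kind node. A rough sketch (assumes the default kind node name
+# "kind-control-plane"; adjust for your cluster):
+#
+#   docker exec kind-control-plane crictl images | grep kepler-operator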
+ +# Install operator via Helm +install_helm() { + header "Install Operator via Helm" + + # Sync CRDs first + run make helm-sync-crds + + # Install via Helm + run helm upgrade --install "$HELM_RELEASE_NAME" \ + manifests/helm/kepler-operator \ + --namespace "$HELM_NAMESPACE" \ + --create-namespace \ + --set operator.image="$OPERATOR_IMG" \ + --set kepler.image="$KEPLER_IMAGE" \ + --set kube-rbac-proxy.image="$KUBE_RBAC_PROXY_IMAGE" \ + --timeout=5m \ + --wait + + ok "Operator installed via Helm" +} + +# Wait for webhook certificate to be ready +wait_for_webhook_cert() { + header "Waiting for Webhook Certificate" + + info "Waiting for webhook certificate to be issued..." + run kubectl wait --for=condition=Ready --timeout=300s \ + -n "$HELM_NAMESPACE" certificate/kepler-operator-serving-cert + + # Give webhook time to start with the certificate + sleep 10 + + ok "Webhook certificate ready" +} + +# Deploy PowerMonitor on VM with fake CPU meter +deploy_pm_on_vm() { + # Deploy PowerMonitor CR first (operator will create namespace) + info "Creating PowerMonitor resource with fake CPU meter" + kubectl apply -f "$POWERMONITOR_VM_YAML" + + # Wait for operator to create the namespace + info "Waiting for operator to create namespace $POWERMONITOR_NS" + kubectl wait --for=jsonpath='{.status.phase}'=Active \ + --timeout=60s namespace/"$POWERMONITOR_NS" 2>/dev/null || { + # Namespace might not exist yet, wait for it to be created + local retries=30 + while [[ $retries -gt 0 ]]; do + if kubectl get namespace "$POWERMONITOR_NS" >/dev/null 2>&1; then + break + fi + sleep 2 + ((retries--)) + done + } + + # Create fake CPU meter ConfigMap after namespace exists + info "Creating fake CPU meter ConfigMap" + kubectl apply -n "$POWERMONITOR_NS" -f "$FAKE_CPU_CONFIGMAP_YAML" +} + +# Deploy PowerMonitor on bare metal with hardware sensors +deploy_pm_on_baremetal() { + info "Creating PowerMonitor resource (using hardware sensors)" + kubectl apply -f "$POWERMONITOR_BAREMETAL_YAML" +} + +# Deploy PowerMonitor +deploy_powermonitor() { + header "Deploy PowerMonitor" + + if $RUNNING_ON_VM; then + deploy_pm_on_vm + else + deploy_pm_on_baremetal + fi + + # Wait for PowerMonitor to be ready + wait_for_powermonitor power-monitor + + ok "PowerMonitor deployed successfully" +} + +# Verify deployment +verify_deployment() { + header "Verify Deployment" + + # Check operator deployment + info "Verifying operator deployment..." + kubectl get deployment -n "$HELM_NAMESPACE" kepler-operator-controller + + # Check PowerMonitor DaemonSet + info "Verifying PowerMonitor DaemonSet..." + kubectl get daemonset -n "$POWERMONITOR_NS" power-monitor + + # Check pods are running + info "Checking PowerMonitor pods..." 
+	kubectl get pods -n "$POWERMONITOR_NS"
+
+	ok "All components verified"
+}
+
+# Uninstall Helm release
+uninstall_helm() {
+	header "Uninstall Helm Release"
+
+	# Delete PowerMonitor first
+	kubectl delete powermonitor power-monitor --ignore-not-found=true || true
+	sleep 5
+
+	# Uninstall Helm release
+	run helm uninstall "$HELM_RELEASE_NAME" \
+		--namespace "$HELM_NAMESPACE" || true
+
+	ok "Helm release uninstalled"
+}
+
+# Parse command line arguments
+parse_args() {
+	while [[ $# -gt 0 ]]; do
+		case $1 in
+		-h | --help)
+			SHOW_USAGE=true
+			return 0
+			;;
+		--no-build)
+			NO_BUILD=true
+			shift
+			;;
+		--no-deploy)
+			NO_DEPLOY=true
+			shift
+			;;
+		--cleanup)
+			CLEANUP=true
+			shift
+			;;
+		--running-on-vm)
+			RUNNING_ON_VM=true
+			shift
+			;;
+		--version)
+			shift
+			VERSION="$1"
+			OPERATOR_IMG="$IMG_BASE/kepler-operator:$VERSION"
+			shift
+			;;
+		--version=*)
+			VERSION="${1#*=}"
+			OPERATOR_IMG="$IMG_BASE/kepler-operator:$VERSION"
+			shift
+			;;
+		*)
+			err "Unknown option: $1"
+			SHOW_USAGE=true
+			return 1
+			;;
+		esac
+	done
+	return 0
+}
+
+# Show usage
+show_usage() {
+	local scr
+	scr="$(basename "$0")"
+
+	cat <<-EOF
+	🔆 Usage:
+	  $scr [OPTIONS]
+
+	📋 Description:
+	  Run Helm E2E tests for the Kepler Operator
+
+	💡 Examples:
+	  # Full flow: build, load, deploy, verify
+	  ❯ $scr
+
+	  # Run in CI/VM environment (enables fake CPU meter)
+	  ❯ $scr --running-on-vm
+
+	  # Use existing image (skip build)
+	  ❯ $scr --no-build --version=0.21.0
+
+	  # Quick iteration (skip deploy, just verify)
+	  ❯ $scr --no-deploy
+
+	  # Full flow with cleanup
+	  ❯ $scr --cleanup
+
+	⚙️ Options:
+	  -h, --help        Show this help
+	  --no-build        Skip building operator image
+	  --no-deploy       Skip deployment (assumes operator already installed)
+	  --cleanup         Uninstall Helm release after test
+	  --running-on-vm   Enable fake CPU meter (for VMs without hardware sensors)
+	  --version VER     Operator version to test (default: $VERSION)
+
+	📝 Prerequisites:
+	  - Kubernetes cluster running (kind recommended)
+	  - cert-manager installed (run 'make cluster-up')
+	  - helm, kubectl, docker available
+
+	📂 Logs:
+	  Test logs are saved to: $LOGS_DIR
+	EOF
+
+	return 0
+}
+
+# Print test configuration
+print_config() {
+	header "Test Configuration"
+	cat <<-EOF
+	Operator Image:  $OPERATOR_IMG
+	Kepler Image:    $KEPLER_IMAGE
+	Kube RBAC Proxy: $KUBE_RBAC_PROXY_IMAGE
+	Helm Release:    $HELM_RELEASE_NAME
+	Helm Namespace:  $HELM_NAMESPACE
+	PowerMonitor NS: $POWERMONITOR_NS
+	Skip Build:      $NO_BUILD
+	Skip Deploy:     $NO_DEPLOY
+	Running on VM:   $RUNNING_ON_VM
+	Cleanup After:   $CLEANUP
+	Logs Directory:  $LOGS_DIR
+
+	EOF
+	line 50
+}
+
+# Main test flow
+main() {
+	export PATH="$LOCAL_BIN:$PATH"
+
+	# Parse arguments
+	parse_args "$@" || {
+		show_usage
+		return 1
+	}
+
+	if $SHOW_USAGE; then
+		show_usage
+		return 0
+	fi
+
+	cd "$PROJECT_ROOT"
+
+	# Initialize logs directory
+	init_logs_dir "$LOGS_DIR"
+
+	# Print configuration
+	print_config
+
+	# Start background event logging
+	log_events "$HELM_NAMESPACE" "$LOGS_DIR/operator-events.log" &
+	log_events "$POWERMONITOR_NS" "$LOGS_DIR/powermonitor-events.log" &
+
+	local ret=0
+
+	# Run test flow
+	if ! $NO_DEPLOY; then
+		build_operator || ret=$?
+		[[ $ret -ne 0 ]] && return $ret
+
+		load_operator_image || ret=$?
+		[[ $ret -ne 0 ]] && return $ret
+
+		install_helm || ret=$?
+		[[ $ret -ne 0 ]] && return $ret
+
+		wait_for_webhook_cert || ret=$?
+		[[ $ret -ne 0 ]] && return $ret
+
+		wait_for_operator "$HELM_NAMESPACE" "kepler-operator-controller" || ret=$?
+		[[ $ret -ne 0 ]] && return $ret
+
+		deploy_powermonitor || ret=$?
+		[[ $ret -ne 0 ]] && return $ret
+	fi
+
+	verify_deployment || ret=$?
+
+	# Cleanup background jobs
+	cleanup_jobs
+
+	# Always gather cluster state after test run (for debugging)
+	gather_cluster_state "$LOGS_DIR" "$HELM_NAMESPACE"
+
+	if [[ $ret -eq 0 ]]; then
+		ok "✅ Helm E2E Tests Passed"
+	else
+		fail "❌ Helm E2E Tests Failed"
+		info "Check logs in: $LOGS_DIR"
+	fi
+
+	return $ret
+}
+
+main "$@"
diff --git a/tests/run-e2e.sh b/tests/run-e2e.sh
index 1c3d4914..fb870037 100755
--- a/tests/run-e2e.sh
+++ b/tests/run-e2e.sh
@@ -19,7 +19,7 @@ declare -r OPERATOR_DEPLOY_NAME="kepler-operator-controller"
 declare -r OPERATOR_RELEASED_BUNDLE="quay.io/sustainable_computing_io/$OPERATOR-bundle"
 declare -r TEST_IMAGES_YAML="tests/images.yaml"
 
-declare IMG_BASE="${IMG_BASE:-localhost:5001/$OPERATOR}"
+declare IMG_BASE="${IMG_BASE:-localhost:5001}"
 # NOTE: this vars are initialized in init_operator_img
 declare OPERATOR_IMG=""
 declare BUNDLE_IMG=""
diff --git a/tests/testdata/fake-cpu-configmap.yaml b/tests/testdata/fake-cpu-configmap.yaml
new file mode 100644
index 00000000..c2bd66eb
--- /dev/null
+++ b/tests/testdata/fake-cpu-configmap.yaml
@@ -0,0 +1,10 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: power-monitor-config
+data:
+  config.yaml: |
+    dev:
+      fake-cpu-meter:
+        enabled: true
diff --git a/tests/testdata/powermonitor-baremetal.yaml b/tests/testdata/powermonitor-baremetal.yaml
new file mode 100644
index 00000000..2a53c2cb
--- /dev/null
+++ b/tests/testdata/powermonitor-baremetal.yaml
@@ -0,0 +1,12 @@
+---
+apiVersion: kepler.system.sustainable.computing.io/v1alpha1
+kind: PowerMonitor
+metadata:
+  name: power-monitor
+spec:
+  kepler:
+    deployment:
+      nodeSelector:
+        kubernetes.io/os: linux
+    config:
+      logLevel: info
diff --git a/tests/testdata/powermonitor-vm.yaml b/tests/testdata/powermonitor-vm.yaml
new file mode 100644
index 00000000..877862e3
--- /dev/null
+++ b/tests/testdata/powermonitor-vm.yaml
@@ -0,0 +1,14 @@
+---
+apiVersion: kepler.system.sustainable.computing.io/v1alpha1
+kind: PowerMonitor
+metadata:
+  name: power-monitor
+spec:
+  kepler:
+    deployment:
+      nodeSelector:
+        kubernetes.io/os: linux
+    config:
+      logLevel: info
+      additionalConfigMaps:
+        - name: power-monitor-config
diff --git a/tests/utils.sh b/tests/utils.sh
new file mode 100644
index 00000000..effc9cc0
--- /dev/null
+++ b/tests/utils.sh
@@ -0,0 +1,153 @@
+#!/usr/bin/env bash
+# Shared test utilities for e2e tests
+# This file contains common functions used by both run-e2e.sh and helm.sh
+
+# Ensure PROJECT_ROOT is set
+if [[ -z "${PROJECT_ROOT:-}" ]]; then
+	PROJECT_ROOT="$(git rev-parse --show-toplevel)"
+	declare -r PROJECT_ROOT
+fi
+
+# Source basic utilities
+source "$PROJECT_ROOT/hack/utils.bash"
+
+# Common test variables
+declare -r LOCAL_BIN="${LOCAL_BIN:-$PROJECT_ROOT/tmp/bin}"
+declare -r OPERATOR_DEPLOY_NAME="${OPERATOR_DEPLOY_NAME:-kepler-operator-controller}"
+
+# Initialize logs directory
+# Creates a new logs directory and moves the old one to -prev
+init_logs_dir() {
+	local logs_dir="${1:-tmp/e2e}"
+
+	rm -rf "$logs_dir-prev"
+	mv "$logs_dir" "$logs_dir-prev" 2>/dev/null || true
+	mkdir -p "$logs_dir"
+}
+
+# Load a docker image into kind cluster
+kind_load_image() {
+	local img="$1"
+
+	# Check if image exists locally first
+	if ! docker image inspect "$img" &>/dev/null; then
+		# Image not local, try to pull it
+		run docker pull "$img"
+	fi
+
+	run kind load docker-image "$img"
+}
+
+# Log kubernetes events for a namespace
+# Usage: log_events <namespace> [log-file]
+log_events() {
+	local ns="$1"
+	local log_file="${2:-events.log}"
+
+	kubectl get events -w \
+		-o custom-columns=FirstSeen:.firstTimestamp,LastSeen:.lastTimestamp,Count:.count,From:.source.component,Type:.type,Reason:.reason,Message:.message \
+		-n "$ns" | tee "$log_file"
+}
+
+# Wait for operator deployment to be ready
+# Works for both OLM and Helm deployments
+# Usage: wait_for_operator <namespace> [deployment-name]
+wait_for_operator() {
+	local ns="$1"
+	local deploy_name="${2:-$OPERATOR_DEPLOY_NAME}"
+	local deployment="deploy/$deploy_name"
+
+	header "Waiting for Kepler Operator ($ns) to be Ready"
+
+	wait_until 30 10 "operator to run" \
+		kubectl -n "$ns" rollout status "$deployment"
+
+	run kubectl wait -n "$ns" --for=condition=Available \
+		--timeout=300s "$deployment"
+
+	ok "Operator up and running"
+}
+
+# Wait for PowerMonitor to be available
+# Usage: wait_for_powermonitor [powermonitor-name]
+wait_for_powermonitor() {
+	local pm_name="${1:-power-monitor}"
+
+	header "Waiting for PowerMonitor to be ready"
+	wait_until 10 10 "powermonitor to be available" condition_check "True" kubectl get powermonitor "$pm_name" \
+		-o jsonpath="{.status.conditions[?(@.type=='Available')].status}" || {
+		fail "PowerMonitor is not ready"
+		return 1
+	}
+	ok "PowerMonitor is ready"
+	return 0
+}
+
+# Create ConfigMap to enable fake CPU meter for testing
+# Usage: create_fake_cpu_configmap <namespace> [configmap-name]
+create_fake_cpu_configmap() {
+	local ns="$1"
+	local cm_name="${2:-power-monitor-config}"
+
+	info "Creating fake CPU meter ConfigMap in namespace $ns"
+	kubectl create namespace "$ns" 2>/dev/null || true
+	# Same payload as tests/testdata/fake-cpu-configmap.yaml
+	kubectl apply -n "$ns" -f - <<EOF
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: $cm_name
+data:
+  config.yaml: |
+    dev:
+      fake-cpu-meter:
+        enabled: true
+EOF
+}
+
+# Cleanup background jobs started by the tests (e.g. event loggers)
+# Usage: cleanup_jobs
+cleanup_jobs() {
+	# kill may fail when no background jobs are left; ignore errors
+	kill $(jobs -p) 2>/dev/null || true
+	return 0
+}
+
+# Update CRDs
+# Usage: update_crds
+update_crds() {
+	info "Updating CRDs..."
+ run kubectl apply --server-side --force-conflicts -k config/crd + run kubectl wait --for=condition=Established crds --all --timeout=120s + return 0 +} + +# Gather cluster state for debugging +# Usage: gather_cluster_state +gather_cluster_state() { + local output_dir="$1" + local ns="${2:-}" + + mkdir -p "$output_dir" + + info "Gathering cluster state to $output_dir" + + # All resources + kubectl get all -A >"$output_dir/all-resources.txt" 2>&1 || true + + # Events + kubectl get events -A >"$output_dir/events.txt" 2>&1 || true + + # PowerMonitor resources + kubectl get powermonitor -o yaml >"$output_dir/powermonitor.yaml" 2>&1 || true + kubectl get powermonitorinternal -o yaml >"$output_dir/powermonitorinternal.yaml" 2>&1 || true + + # Operator logs if namespace provided + if [[ -n "$ns" ]]; then + kubectl logs -n "$ns" -l app.kubernetes.io/component=manager --tail=200 \ + >"$output_dir/operator-logs.txt" 2>&1 || true + kubectl describe deployment -n "$ns" "$OPERATOR_DEPLOY_NAME" \ + >"$output_dir/operator-deployment.txt" 2>&1 || true + fi + + ok "Cluster state gathered" +} diff --git a/tests/utils/framework.go b/tests/utils/framework.go index 690f82ab..f3855d23 100644 --- a/tests/utils/framework.go +++ b/tests/utils/framework.go @@ -593,7 +593,7 @@ func (f Framework) DeployOpenshiftCerts(serviceName, serviceNamespace, clusterIs func (f Framework) InstallCertManager() { f.T.Helper() - _, err := oc.Literal().From("kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.yaml").Run() + _, err := oc.Literal().From("kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.18.2/cert-manager.yaml").Run() assert.NoError(f.T, err, "failed to install cert-manager") f.WaitUntil("cert-manager pods are running", func(ctx context.Context) (bool, error) { @@ -612,7 +612,7 @@ func (f Framework) InstallCertManager() { }, Timeout(5*time.Minute)) f.T.Cleanup(func() { - _, err := oc.Literal().From("kubectl delete -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.yaml").Run() + _, err := oc.Literal().From("kubectl delete -f https://github.com/cert-manager/cert-manager/releases/download/v1.18.2/cert-manager.yaml").Run() assert.NoError(f.T, err, "failed to uninstall cert-manager") }) }