Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/configs/helm-lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

# See https://github.com/helm/chart-testing#configuration
# chart-testing (ct) settings passed via --config by the lint workflow.

# Directories that are searched for charts.
chart-dirs:
  - charts

# Git remote and branch used as the comparison base for changed charts.
remote: origin
target-branch: main

# Checks that are switched off for this repository.
check-version-increment: false
validate-maintainers: false
27 changes: 0 additions & 27 deletions .github/workflows/codeql.yaml

This file was deleted.

66 changes: 66 additions & 0 deletions .github/workflows/helm-lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Lints every chart touched by a pull request and fails the run when the chart
# version found in the PR is already published in the OCI registry.
name: "Helm lint and tests"
on:
  pull_request:
    types: [ opened, synchronize, reopened ]

# The job reads the repository and writes one comment on the pull request.
permissions:
  contents: read
  pull-requests: write

env:
  REGISTRY: ghcr.io

jobs:
  helm-lint-test:
    runs-on: [ default ]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
        with:
          # Full history so chart-testing can diff against the target branch.
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0

      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: '3.9'
          check-latest: true
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up chart-linting and chart-testing
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0

      # Sets the output `changed=true` only when at least one chart changed.
      - name: Run chart-linting (list-changed)
        id: list-changed
        run: |
          changed=$(ct list-changed --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }})
          if [[ -n "$changed" ]]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Run chart-linting
        if: steps.list-changed.outputs.changed == 'true'
        run: ct lint --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }}

      # Outputs chart, chart_version and needsbump=true when a changed chart's
      # current version can already be pulled from the registry.
      - name: Check version bump
        id: check-bump
        if: steps.list-changed.outputs.changed == 'true'
        continue-on-error: true
        run: |
          for chart in $(ct list-changed --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }}); do
            # ct prints chart directories such as charts/<name>. The registry
            # path needs the chart's own name, so read it from Chart.yaml
            # instead of taking the parent directory.
            chart_name=$(yq .name "$chart/Chart.yaml")
            chart_version=$(yq .version "$chart/Chart.yaml")
            # A successful pull means this exact version already exists.
            if helm pull "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/${chart_name}" --version "$chart_version"; then
              echo "chart=${chart_name}" >> "$GITHUB_OUTPUT"
              echo "chart_version=${chart_version}" >> "$GITHUB_OUTPUT"
              echo "needsbump=true" >> "$GITHUB_OUTPUT"
            fi
          done

      # Comments on the pull request and then marks the job as failed.
      - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7
        if: steps.check-bump.outputs.needsbump == 'true'
        with:
          script: |
            // Await the request so the comment is posted (and any API error
            // is surfaced) before the step is marked as failed.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: ':warning: Chart `oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/${{ steps.check-bump.outputs.chart }}:${{ steps.check-bump.outputs.chart_version }}` already exists in OCI registry. Please increment the chart version.'
            })
            core.setFailed(`Action failed with error: Chart version bump required`);
99 changes: 99 additions & 0 deletions .github/workflows/helm-release.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Packages each chart listed in the matrix and pushes it to the OCI registry
# when files in the chart directory changed and the version is not yet
# published. Runs chart-releaser afterwards.
name: Package Helm Chart and publish to GitHub Packages

on:
  workflow_dispatch: {}
  push:
    branches:
      - main
    paths:
      - .github/workflows/helm-release.yaml
      - charts/**

permissions:
  contents: write
  packages: write

env:
  REGISTRY: ghcr.io
  ACTIONS_RUNNER_DEBUG: false

jobs:
  helm-release:
    runs-on: [ default ]
    strategy:
      fail-fast: false
      matrix:
        include:
          - chartDir: charts/controlplane-operations
            chartName: controlplane-operations

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Configure Git
        run: |
          git config user.name "$GITHUB_ACTOR"
          git config user.email "$GITHUB_ACTOR@users.noreply.github.com"

      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0

      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5
        with:
          # Quoted so the version is always read as a string, never a float.
          python-version: '3.9'
          check-latest: true
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Log into registry ${{ env.REGISTRY }}
        if: github.event_name != 'pull_request'
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Restricts change detection to the chart directory of this matrix entry.
      - name: Get changed files
        id: changed-files
        uses: tj-actions/changed-files@2f7c5bfce28377bc069a65ba478de0a74aa0ca32 # v46
        with:
          files: |
            ${{ matrix.chartDir }}/**

      # Outputs chart_version, plus bump=true when that version can already be
      # pulled from the registry.
      - name: Check if Helm chart with same version already exists
        id: check-chart
        if: steps.changed-files.outputs.all_changed_files != ''
        env:
          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        run: |
          # List all changed
          echo "All changed files: $ALL_CHANGED_FILES"
          chartVersion=$(yq .version "${{ matrix.chartDir }}/Chart.yaml")
          echo "chart_version=${chartVersion}" >> "$GITHUB_OUTPUT"
          # Test helm's exit status directly. Wrapping the call in $(...)
          # would run whatever helm prints on stdout as a command.
          if helm pull "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/${{ matrix.chartName }}" --version "$chartVersion"; then
            echo "bump=true" >> "$GITHUB_OUTPUT"
          fi

      # Fails the job instead of overwriting an already published version.
      - name: Chart needs version bump
        if: steps.check-chart.outputs.bump == 'true'
        env:
          CHART_VERSION: ${{ steps.check-chart.outputs.chart_version }}
        run: |
          echo "Chart ${{ matrix.chartDir }}:${{ env.CHART_VERSION }} already exists in OCI registry. Skipping upload. Please increment the chart version."
          exit 1

      - name: Push Charts to GHCR
        if: steps.changed-files.outputs.all_changed_files != '' && steps.check-chart.outputs.bump != 'true'
        run: |
          helm package "${{ matrix.chartDir }}" -d "${{ matrix.chartDir }}"
          PKG_NAME=$(ls ${{ matrix.chartDir }}/*.tgz)
          helm push "${PKG_NAME}" "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/"

      - name: Run chart-releaser
        uses: helm/chart-releaser-action@cae68fefc6b5f367a0275617c9f83181ba54714f # v1.7.0
        env:
          CR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          CR_GENERATE_RELEASE_NOTES: true
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,29 @@

A set of Plutono dashboards and Prometheus alert rules combined with playbooks to ensure effective operation of Controlplane clusters within the ApeiroRA stack.

# Content

The content is structured as follows:

```
controlplane-operations
├── playbooks/ Step-by-step instructions for troubleshooting.
└── charts/
└── controlplane-operations
├── alerts Prometheus alerts for Kubernetes.
├── dashboards Plutono dashboards for visualizing key metrics.
└── Chart.yaml Helm chart manifest.
```

## Requirements and Setup

The content of the repository can be installed independently or as part of the [greenhouse-extensions](https://github.com/cloudoperators/greenhouse-extensions/tree/main/kube-monitoring).

## Support, Feedback, Contributing

Expand Down
17 changes: 16 additions & 1 deletion REUSE.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,17 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0

# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

# Version of the REUSE.toml format used by this file.
version = 1
# Package-level SPDX metadata for the repository.
SPDX-PackageName = "controlplane-operations"
SPDX-PackageSupplier = "Vladimir Videlov <vladimir.videlov@sap.com>"
SPDX-PackageDownloadLocation = "https://github.com/cloudoperators/controlplane-operations"
# Disclaimer on API calls to SAP or third-party products made from this project's code.
SPDX-PackageComment = "The code in this project may include calls to APIs (\"API Calls\") of\n SAP or third-party products or services developed outside of this project\n (\"External Products\").\n \"APIs\" means application programming interfaces, as well as their respective\n specifications and implementing code that allows software to communicate with\n other software.\n API Calls to External Products are not licensed under the open source license\n that governs this project. The use of such API Calls and related External\n Products are subject to applicable additional agreements with the relevant\n provider of the External Products. In no event shall the open source license\n that governs this project grant any rights in or to any External Products,or\n alter, expand or supersede any terms of the applicable additional agreements.\n If you have a valid license agreement with SAP for the use of a particular SAP\n External Product, then you may make use of any API Calls included in this\n project's code for that SAP External Product, subject to the terms of such\n license agreement. If you do not have a valid license agreement for the use of\n a particular SAP External Product, then you may only make use of any API Calls\n in this project for that SAP External Product for your internal, non-productive\n and non-commercial test and evaluation of such API Calls. Nothing herein grants\n you any rights to use or access any SAP External Product, or provide any third\n parties the right to use of access any SAP External Product, through API Calls."

# Copyright and license annotation whose path glob "**" matches every file.
[[annotations]]
path = "**"
# NOTE(review): "aggregate" presumably combines this annotation with in-file SPDX headers; confirm against the REUSE specification.
precedence = "aggregate"
SPDX-FileCopyrightText = "2025 SAP SE or an SAP affiliate company and Greenhouse contributors"
SPDX-License-Identifier = "Apache-2.0"
17 changes: 17 additions & 0 deletions charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

# Helm chart manifest for the controlplane-operations chart.
apiVersion: v2
name: controlplane-operations
description: >-
  A set of Plutono dashboards and Prometheus alerting rules combined with
  playbooks to ensure effective operations of Controlplane clusters.
version: 1.0.0
keywords:
  - Helm Chart
  - Controlplane operations
  - Plutono Dashboards
  - Prometheus Alerting
  - Alert Rules
  - Playbooks
maintainers:
  - name: Vladimir Videlov (d051408)
    email: vladimir.videlov@sap.com
36 changes: 36 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-bond.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

# Alerting rules for bond and virtual network interface state on nodes.
# The `{{`...`}}` wrappers emit literal Prometheus template expressions when
# the file is rendered by Helm.
groups:
  - name: controlplane.bond.alerts
    rules:
      # Summed node_bonding_active per (master, node) below 2 for 15 minutes.
      - alert: NodeBondDegradedMain
        expr: 'sum(node_bonding_active) by (master, node) < 2'
        for: 15m
        labels:
          severity: warning
          tier: k8s
          service: node
          context: bond
          support_group: containers
          meta: "`{{`{{ $labels.node }}`}}`"
          playbook: "docs/support/playbook/kubernetes/k8s_bond_degraded"
        annotations:
          summary: "Bond `{{`{{ $labels.master }}`}}` is degraded. Node network connectivity is not HA. Switch failover or upgrade will cause an outage!"
          description: "Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node."

      # A bond* or vlan* device has reported node_network_up == 0 for 15 minutes.
      - alert: NodeVirtualInterfaceDown
        expr: 'sum(node_network_up{device=~"bond.*|vlan.*"} == 0) by (node, device)'
        for: 15m
        labels:
          severity: warning
          tier: k8s
          service: node
          context: bond
          support_group: containers
          meta: "`{{`{{ $labels.node }}`}}`"
          playbook: "docs/support/playbook/kubernetes/k8s_node_interface_down"
        annotations:
          summary: "Interface `{{`{{ $labels.device }}`}}` is down. Node network connectivity is degraded. Check ESX node state in vCenter."
          description: "Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node."
21 changes: 21 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-node.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

# Alerting rule for the BridgeFilterVLANTagged node condition.
# The `{{`...`}}` wrappers emit literal Prometheus template expressions when
# the file is rendered by Helm.
groups:
  - name: controlplane.node.alerts
    rules:
      # The node condition BridgeFilterVLANTagged has been true for 15 minutes.
      - alert: KubernetesNodeBridgeFilterVLANTagged
        expr: 'kube_node_status_condition{condition="BridgeFilterVLANTagged", status="true"} == 1'
        for: 15m
        labels:
          severity: info
          tier: k8s
          service: node
          context: label
          support_group: containers
          meta: "`{{`{{ $labels.node }}`}}`"
          playbook: "docs/support/playbook/kubernetes/k8s_node_bridge_filter_iptables"
        annotations:
          summary: "Bridged VLAN-tagged traffic is filtered by IPtables."
          description: "VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened!"
41 changes: 41 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

# Recording and alerting rules for PersistentVolumeClaim free space.
# The `{{`...`}}` wrappers emit literal Prometheus template expressions when
# the file is rendered by Helm.
groups:
  - name: controlplane.pvc
    rules:
      # Percentage of free bytes per PVC, joined with the PVC's
      # label_ccloud_support_group / label_ccloud_service labels so the alerts
      # below can route on them.
      - record: kubelet_volume_stats_available_percent
        expr: '(min by (persistentvolumeclaim, namespace) (100 * kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes)) * on(persistentvolumeclaim, namespace) group_left(label_ccloud_support_group, label_ccloud_service) (max by (persistentvolumeclaim, namespace, label_ccloud_support_group, label_ccloud_service) (kube_persistentvolumeclaim_labels))'

  - name: controlplane.pvc.alerts
    rules:
      ### PVC usage ###
      # Info level: less than 10% free for 10 minutes.
      # support_group and service fall back to "containers" / "resources" when
      # the PVC carries no ccloud labels.
      - alert: KubernetesPVCNoSpaceLeft
        expr: kubelet_volume_stats_available_percent < 10
        for: 10m
        labels:
          tier: k8s
          support_group: "`{{`{{ if $labels.label_ccloud_support_group }}`}}``{{`{{ $labels.label_ccloud_support_group }}`}}``{{`{{ else }}`}}`containers`{{`{{ end }}`}}`"
          service: "`{{`{{ if $labels.label_ccloud_service }}`}}``{{`{{ $labels.label_ccloud_service }}`}}``{{`{{ else }}`}}`resources`{{`{{ end }}`}}`"
          severity: info
          context: storage
          meta: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` free space is less than 10%."
          playbook: "docs/support/playbook/kubernetes/pvc_usage"
        annotations:
          description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is almost full. Increase or delete files."
          summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` free space is less than 10%."

      # Warning level: same alert name, less than 2% free for 10 minutes.
      - alert: KubernetesPVCNoSpaceLeft
        expr: kubelet_volume_stats_available_percent < 2
        for: 10m
        labels:
          tier: k8s
          support_group: "`{{`{{ if $labels.label_ccloud_support_group }}`}}``{{`{{ $labels.label_ccloud_support_group }}`}}``{{`{{ else }}`}}`containers`{{`{{ end }}`}}`"
          service: "`{{`{{ if $labels.label_ccloud_service }}`}}``{{`{{ $labels.label_ccloud_service }}`}}``{{`{{ else }}`}}`resources`{{`{{ end }}`}}`"
          severity: warning
          context: storage
          meta: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` usage is over 98%."
          playbook: "docs/support/playbook/kubernetes/pvc_usage"
        annotations:
          description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is full. Programs will stop working if relying upon free storage."
          summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` usage is over 98%."
Loading
Loading