Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/configs/helm-lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

# chart-testing (`ct`) configuration; option reference:
# https://github.com/helm/chart-testing#configuration

# Directories that are searched for charts.
chart-dirs:
  - charts

# Git remote and branch that changed charts are compared against.
remote: origin
target-branch: main

# Checks that are switched off for `ct lint`.
validate-maintainers: false
check-version-increment: false
26 changes: 6 additions & 20 deletions .github/licenserc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ header:
license:
spdx-id: Apache-2.0
content: |
SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
SPDX-FileCopyrightText: [year] SAP SE or an SAP affiliate company and Greenhouse contributors
SPDX-License-Identifier: Apache-2.0
pattern: |
SPDX-FileCopyrightText: [0-9]+ SAP SE or an SAP affiliate company and Greenhouse contributors
SPDX-License-Identifier: Apache-2\.0

paths: # `paths` are the path list that will be checked (and fixed) by license-eye, default is ['**'].
- '**'
Expand All @@ -12,34 +15,17 @@ header:
- '.github/**'
- '.reuse/dep5'
- 'LICENSES/*.txt'
- 'grafana/*.json'
- 'website/scripts/**'
- 'hack/**'
- 'hack/openapi-generator/openapi-info.yaml'
- 'hack/boilerplate.go.txt'
- 'docs/**'
- '**/*.md'
- 'LICENSE'
- 'NOTICE'
- 'PROJECT'
- '**/*.gitignore'
- '**/*.helmignore'
- '**/*.tpl'
- '**/go.mod'
- '**/go.sum'
- '**/*.lock'
- '**/*.json'
- '**/.gitkeep'
- '**/*.txt'
- '*Dockerfile*'
- 'Makefile'
- 'pkg/idproxy/web/**'
- 'pkg/apis/scheme_builder.go' # Belongs to the Kubernetes authors
- 'cmd/tcp-proxy/main.go' # MIT License
- 'pkg/tcp-proxy/proxy/*.go' # MIT License
- '**/zz_generated.deepcopy.go' # Generated by Kubebuilder
- 'charts/**/templates/*.yaml' # license headers on helm templates are causing issues

- 'README.md.gotmpl'
- 'charts/**/**/*.yaml' # license headers on helm templates are causing issues

comment: on-failure

Expand Down
27 changes: 0 additions & 27 deletions .github/workflows/codeql.yaml

This file was deleted.

66 changes: 66 additions & 0 deletions .github/workflows/helm-lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Lints the Helm charts changed by a pull request and fails the run (with a PR
# comment) when a changed chart's version is already present in the OCI registry.
name: "Helm lint and tests"
on:
  pull_request:
    types: [opened, synchronize, reopened]

env:
  REGISTRY: ghcr.io

jobs:
  helm-lint-test:
    runs-on: [default]
    steps:
      # Full history is fetched so that ct can diff against the target branch.
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0

      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5
        with:
          # Quoted so the version is never read as a float.
          python-version: "3.9"
          check-latest: true
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up chart-linting and chart-testing
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0

      # Sets the `changed` output to 'true' only when ct reports at least one chart.
      - name: Run chart-linting (list-changed)
        id: list-changed
        run: |
          changed=$(ct list-changed --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }})
          if [[ -n "$changed" ]]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Run chart-linting
        if: steps.list-changed.outputs.changed == 'true'
        run: ct lint --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }}

      # A successful `helm pull` means the version is already published, so a
      # bump is required. The step itself never fails the job
      # (continue-on-error); the github-script step below does.
      - name: Check version bump
        id: check-bump
        if: steps.list-changed.outputs.changed == 'true'
        continue-on-error: true
        run: |
          for chart in $(ct list-changed --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }}); do
            # ct reports chart paths below the configured chart dir (charts/<name>),
            # while the release workflow publishes to .../charts/<name>; the last
            # path component is therefore the name to look up in the registry.
            chart_name=$(basename "$chart")
            chart_version=$(yq .version "$chart/Chart.yaml")
            if helm pull "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/${chart_name}" --version "$chart_version"; then
              echo "chart=${chart_name}" >> "$GITHUB_OUTPUT"
              echo "chart_version=${chart_version}" >> "$GITHUB_OUTPUT"
              echo "needsbump=true" >> "$GITHUB_OUTPUT"
            fi
          done

      # Comments on the pull request and marks the run as failed.
      - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7
        if: steps.check-bump.outputs.needsbump == 'true'
        with:
          script: |
            // Awaited so that a failing API call surfaces as a step error.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: ':warning: Chart `oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/${{ steps.check-bump.outputs.chart }}:${{ steps.check-bump.outputs.chart_version }}` already exists in OCI registry. Please increment the chart version.'
            })
            core.setFailed(`Action failed with error: Chart version bump required`);
99 changes: 99 additions & 0 deletions .github/workflows/helm-release.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Packages each chart listed in the matrix and pushes it to the OCI registry
# when files below its directory changed and its version is not yet published,
# then runs chart-releaser.
name: Package Helm Chart and publish to GitHub Packages

on:
  workflow_dispatch: {}
  push:
    branches:
      - main
    paths:
      - '.github/workflows/helm-release.yaml'
      - 'charts/**'

permissions:
  contents: write
  packages: write

env:
  REGISTRY: ghcr.io
  ACTIONS_RUNNER_DEBUG: false

jobs:
  helm-release:
    runs-on: [default]
    strategy:
      fail-fast: false
      matrix:
        include:
          - chartDir: charts/controlplane-operations
            chartName: controlplane-operations

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      # Commit identity for the git operations of later steps.
      # NOTE(review): the address follows the chart-releaser-action README
      # convention — confirm it matches the intended committer e-mail.
      - name: Configure Git
        run: |
          git config user.name "$GITHUB_ACTOR"
          git config user.email "$GITHUB_ACTOR@users.noreply.github.com"

      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0

      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5
        with:
          # Quoted so the version is never read as a float.
          python-version: "3.9"
          check-latest: true
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Log into registry ${{ env.REGISTRY }}
        if: github.event_name != 'pull_request'
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Get changed files
        id: changed-files
        uses: tj-actions/changed-files@2f7c5bfce28377bc069a65ba478de0a74aa0ca32 # v46
        with:
          files: |
            ${{ matrix.chartDir }}/**

      # Outputs: chart_version (always), bump=true when that version can
      # already be pulled from the registry.
      - name: Check if Helm chart with same version already exists
        id: check-chart
        if: steps.changed-files.outputs.all_changed_files != ''
        env:
          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        run: |
          # List all changed
          echo "All changed files: $ALL_CHANGED_FILES"
          chartVersion=$(yq .version "${{ matrix.chartDir }}/Chart.yaml")
          echo "chart_version=${chartVersion}" >> "$GITHUB_OUTPUT"
          # Test helm's exit status directly; wrapping the call in $(...) would
          # run whatever it prints on stdout as a command.
          if helm pull "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/${{ matrix.chartName }}" --version "$chartVersion"; then
            echo "bump=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Chart needs version bump
        if: steps.check-chart.outputs.bump == 'true'
        env:
          CHART_VERSION: ${{ steps.check-chart.outputs.chart_version }}
        run: |
          echo "Chart ${{ matrix.chartDir }}:${{ env.CHART_VERSION }} already exists in OCI registry. Skipping upload. Please increment the chart version."
          exit 1

      # The archive is written into the chart directory and pushed from there.
      - name: Push Charts to GHCR
        if: steps.changed-files.outputs.all_changed_files != '' && steps.check-chart.outputs.bump != 'true'
        run: |
          helm package "${{ matrix.chartDir }}" -d "${{ matrix.chartDir }}"
          PKG_NAME=$(ls "${{ matrix.chartDir }}"/*.tgz)
          helm push "${PKG_NAME}" "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/"

      - name: Run chart-releaser
        uses: helm/chart-releaser-action@cae68fefc6b5f367a0275617c9f83181ba54714f # v1.7.0
        env:
          CR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          CR_GENERATE_RELEASE_NOTES: true
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,29 @@

A set of Perses dashboards and Prometheus alert rules combined with playbooks to ensure effective operation of Controlplane clusters within the ApeiroRA stack.

# Content

The content is structured as follows:

```
controlplane-operations
├── playbooks/ Step-by-step instructions for troubleshooting.
└── charts/
└── controlplane-operations
├── alerts Prometheus alerts for Kubernetes.
├── dashboards Perses dashboards for visualizing key metrics.
└── Chart.yaml Helm chart manifest.
```

## Requirements and Setup

*Insert a short description what is required to get your project running...*
The content of the repository can be installed as a [Greenhouse](https://github.com/cloudoperators/greenhouse) Plugin.

## Support, Feedback, Contributing

Expand Down
16 changes: 14 additions & 2 deletions REUSE.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,14 @@
# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and Greenhouse contributors
# SPDX-License-Identifier: Apache-2.0

version = 1
SPDX-PackageName = "controlplane-operations"
SPDX-PackageSupplier = "Vladimir Videlov <[email protected]>"
SPDX-PackageDownloadLocation = "https://github.com/cloudoperators/controlplane-operations"
SPDX-PackageComment = "The code in this project may include calls to APIs (\"API Calls\") of\n SAP or third-party products or services developed outside of this project\n (\"External Products\").\n \"APIs\" means application programming interfaces, as well as their respective\n specifications and implementing code that allows software to communicate with\n other software.\n API Calls to External Products are not licensed under the open source license\n that governs this project. The use of such API Calls and related External\n Products are subject to applicable additional agreements with the relevant\n provider of the External Products. In no event shall the open source license\n that governs this project grant any rights in or to any External Products,or\n alter, expand or supersede any terms of the applicable additional agreements.\n If you have a valid license agreement with SAP for the use of a particular SAP\n External Product, then you may make use of any API Calls included in this\n project's code for that SAP External Product, subject to the terms of such\n license agreement. If you do not have a valid license agreement for the use of\n a particular SAP External Product, then you may only make use of any API Calls\n in this project for that SAP External Product for your internal, non-productive\n and non-commercial test and evaluation of such API Calls. Nothing herein grants\n you any rights to use or access any SAP External Product, or provide any third\n parties the right to use of access any SAP External Product, through API Calls."

# Blanket annotation: attaches the copyright text and license identifier below
# to every path matched by "**".
[[annotations]]
path = "**"
# NOTE(review): "aggregate" is a precedence mode defined by the REUSE
# specification — confirm its interaction with per-file SPDX headers is intended.
precedence = "aggregate"
SPDX-FileCopyrightText = "2025 SAP SE or an SAP affiliate company and Greenhouse contributors"
SPDX-License-Identifier = "Apache-2.0"
14 changes: 14 additions & 0 deletions charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Helm chart manifest for the controlplane-operations chart.
apiVersion: v2
name: controlplane-operations
# The CI workflows look this version up in the OCI registry to decide whether
# a bump is required before publishing.
version: 1.0.1
# NOTE(review): the README describes the dashboards as Perses dashboards while
# the description and keywords here say Plutono — confirm which is correct.
description: >-
  A set of Plutono dashboards and Prometheus alerting rules combined with
  playbooks to ensure effective operations of Controlplane clusters.
maintainers:
  - name: Vladimir Videlov (d051408)
    # NOTE(review): this value looks like a redacted placeholder — restore the
    # real address (quoted) before release.
    email: [email protected]
keywords:
  - Helm Chart
  - Controlplane operations
  - Plutono Dashboards
  - Prometheus Alerting
  - Alert Rules
  - Playbooks
28 changes: 28 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-bond.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Alert rules for bonded and virtual network interfaces on nodes.
# Every alert is rendered unless .Values.prometheusRules.disabled.<AlertName> is
# set; its `for` and `severity` default to the values given in the dig calls and
# can be overridden under .Values.prometheusRules.<AlertName>.
groups:
- name: controlplane-bond
  rules:
{{- if not (.Values.prometheusRules.disabled.NodeBondDegradedMain | default false) }}
  # Fires when the summed node_bonding_active per (master, node) is below 2.
  - alert: NodeBondDegradedMain
    expr: sum(node_bonding_active) by (master, node) < 2
    for: {{ dig "NodeBondDegradedMain" "for" "15m" .Values.prometheusRules }}
    labels:
      severity: {{ dig "NodeBondDegradedMain" "severity" "warning" .Values.prometheusRules }}
      # File links on GitHub need the blob/<branch>/ segment to resolve.
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/blob/main/playbooks/NodeBondDegradedMain.md
      {{- include "controlplane-operations.additionalRuleLabels" . | nindent 6 }}
    annotations:
      description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node.
      summary: Bond `{{`{{ $labels.master }}`}}` is degraded. Node network connectivity is not HA. Switch failover or upgrade will cause an outage!
{{- end }}

{{- if not (.Values.prometheusRules.disabled.NodeVirtualInterfaceDown | default false) }}
  # Fires for every bond*/vlan* device whose node_network_up sample equals 0.
  - alert: NodeVirtualInterfaceDown
    expr: sum(node_network_up{device=~"bond.*|vlan.*"} == 0) by (node, device)
    for: {{ dig "NodeVirtualInterfaceDown" "for" "15m" .Values.prometheusRules }}
    labels:
      severity: {{ dig "NodeVirtualInterfaceDown" "severity" "warning" .Values.prometheusRules }}
      # File links on GitHub need the blob/<branch>/ segment to resolve.
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/blob/main/playbooks/NodeVirtualInterfaceDown.md
      {{- include "controlplane-operations.additionalRuleLabels" . | nindent 6 }}
    annotations:
      description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node.
      summary: Interface `{{`{{ $labels.device }}`}}` is down. Node network connectivity is degraded.
{{- end }}
15 changes: 15 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-node.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Alert rules for node conditions.
# The alert is rendered unless .Values.prometheusRules.disabled.<AlertName> is
# set; its `for` and `severity` default to the values given in the dig calls and
# can be overridden under .Values.prometheusRules.<AlertName>.
groups:
- name: controlplane-node
  rules:
{{- if not (.Values.prometheusRules.disabled.KubernetesNodeBridgeFilterVLANTagged | default false) }}
  # Fires while a node reports the BridgeFilterVLANTagged condition as true.
  - alert: KubernetesNodeBridgeFilterVLANTagged
    expr: kube_node_status_condition{condition="BridgeFilterVLANTagged", status="true"} == 1
    for: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "for" "15m" .Values.prometheusRules }}
    labels:
      severity: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "severity" "info" .Values.prometheusRules }}
      # File links on GitHub need the blob/<branch>/ segment to resolve.
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/blob/main/playbooks/KubernetesNodeBridgeFilterVLANTagged.md
      {{- include "controlplane-operations.additionalRuleLabels" . | nindent 6 }}
    annotations:
      description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened!
      summary: Bridged VLAN-tagged traffic is filtered by IPtables.
{{- end }}
33 changes: 33 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Recording and alert rules for persistent volume claim capacity.
groups:
- name: controlplane-pvc
  rules:
  # Lowest free-space percentage per (persistentvolumeclaim, namespace).
  - record: kubelet_volume_stats_available_percent
    expr: (min by (persistentvolumeclaim, namespace) (100 * kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes))

# Both alerts below carry the same name and are governed by the same
# .Values.prometheusRules.disabled.KubernetesPVCNoSpaceLeft toggle and the same
# .Values.prometheusRules.KubernetesPVCNoSpaceLeft overrides; they differ only in
# threshold and default severity.
- name: controlplane.pvc.alerts
  rules:
{{- if not (.Values.prometheusRules.disabled.KubernetesPVCNoSpaceLeft | default false) }}
  # Less than 10% free.
  - alert: KubernetesPVCNoSpaceLeft
    expr: kubelet_volume_stats_available_percent < 10
    for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }}
    labels:
      severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "info" .Values.prometheusRules }}
      # File links on GitHub need the blob/<branch>/ segment to resolve.
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/blob/main/playbooks/KubernetesPVCNoSpaceLeft.md
      {{- include "controlplane-operations.additionalRuleLabels" . | nindent 6 }}
    annotations:
      description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is almost full. Increase or delete files."
      summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` free space is less than 10%."
{{- end }}

{{- if not (.Values.prometheusRules.disabled.KubernetesPVCNoSpaceLeft | default false) }}
  # Less than 2% free.
  - alert: KubernetesPVCNoSpaceLeft
    expr: kubelet_volume_stats_available_percent < 2
    for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }}
    labels:
      severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "warning" .Values.prometheusRules }}
      # File links on GitHub need the blob/<branch>/ segment to resolve.
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/blob/main/playbooks/KubernetesPVCNoSpaceLeft.md
      {{- include "controlplane-operations.additionalRuleLabels" . | nindent 6 }}
    annotations:
      description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is full. Programs will stop working if relying upon free storage."
      summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` usage is over 98%."
{{- end }}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading