diff --git a/.github/configs/helm-lint.yaml b/.github/configs/helm-lint.yaml new file mode 100644 index 0000000..bf2a450 --- /dev/null +++ b/.github/configs/helm-lint.yaml @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors +# SPDX-License-Identifier: Apache-2.0 + +# See https://github.com/helm/chart-testing#configuration +remote: origin +target-branch: main +validate-maintainers: false +check-version-increment: false +chart-dirs: + - charts diff --git a/.github/licenserc.yaml b/.github/licenserc.yaml index 2d24f56..2c61994 100644 --- a/.github/licenserc.yaml +++ b/.github/licenserc.yaml @@ -2,8 +2,11 @@ header: license: spdx-id: Apache-2.0 content: | - SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors + SPDX-FileCopyrightText: [year] SAP SE or an SAP affiliate company and Greenhouse contributors SPDX-License-Identifier: Apache-2.0 + pattern: | + SPDX-FileCopyrightText: [0-9]+ SAP SE or an SAP affiliate company and Greenhouse contributors + SPDX-License-Identifier: Apache-2\.0 paths: # `paths` are the path list that will be checked (and fixed) by license-eye, default is ['**']. 
- '**' @@ -12,34 +15,17 @@ header: - '.github/**' - '.reuse/dep5' - 'LICENSES/*.txt' - - 'grafana/*.json' - - 'website/scripts/**' - - 'hack/**' - - 'hack/openapi-generator/openapi-info.yaml' - - 'hack/boilerplate.go.txt' - - 'docs/**' - '**/*.md' - 'LICENSE' - - 'NOTICE' - - 'PROJECT' - '**/*.gitignore' - '**/*.helmignore' - '**/*.tpl' - - '**/go.mod' - - '**/go.sum' - - '**/*.lock' - - '**/*.json' - '**/.gitkeep' - '**/*.txt' - '*Dockerfile*' - 'Makefile' - - 'pkg/idproxy/web/**' - - 'pkg/apis/scheme_builder.go' # Belongs to the Kubernetes authors - - 'cmd/tcp-proxy/main.go' # MIT License - - 'pkg/tcp-proxy/proxy/*.go' # MIT License - - '**/zz_generated.deepcopy.go' # Generated by Kubebuilder - - 'charts/**/templates/*.yaml' # license headers on helm templates are causing issues - + - 'README.md.gotmpl' + - 'charts/**/**/*.yaml' # license headers on helm templates are causing issues comment: on-failure diff --git a/.github/workflows/codeql.yaml b/.github/workflows/codeql.yaml deleted file mode 100644 index 44f1021..0000000 --- a/.github/workflows/codeql.yaml +++ /dev/null @@ -1,27 +0,0 @@ -name: CodeQL -on: - push: - branches: [ "main" ] - pull_request: - types: [ opened, synchronize, reopened ] - schedule: - - cron: '20 08 * * 1' - -jobs: - codeql: - permissions: - security-events: write - actions: read - contents: read - uses: cloudoperators/common/.github/workflows/shared-codeql.yaml@main - with: - runs-on: "['default']" - language: "['go']" - go-check: true - go-version: "['1.23']" - node-check: false - # node-version : "['node']" - # fail-fast: false - # timeout: 30 - autobuild: true - # build_query: "make something" diff --git a/.github/workflows/helm-lint.yaml b/.github/workflows/helm-lint.yaml new file mode 100644 index 0000000..2ecd418 --- /dev/null +++ b/.github/workflows/helm-lint.yaml @@ -0,0 +1,66 @@ +name: "Helm lint and tests" +on: + pull_request: + types: [ opened, synchronize, reopened ] + +env: + REGISTRY: ghcr.io + +jobs: + helm-lint-test: + 
runs-on: [ default ] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Helm + uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0 + + - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5 + with: + python-version: 3.9 + check-latest: true + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up chart-linting and chart-testing + uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0 + + - name: Run chart-linting (list-changed) + id: list-changed + run: | + changed=$(ct list-changed --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }}) + if [[ -n "$changed" ]]; then + echo "changed=true" >> "$GITHUB_OUTPUT" + fi + + - name: Run chart-linting + if: steps.list-changed.outputs.changed == 'true' + run: ct lint --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }} + + - name: Check version bump + id: check-bump + if: steps.list-changed.outputs.changed == 'true' + continue-on-error: true + run: | + for chart in $(ct list-changed --config .github/configs/helm-lint.yaml --target-branch ${{ github.event.repository.default_branch }}); do + chart_version=$(yq .version "$chart/Chart.yaml") + if helm pull "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/$(basename "$chart")" --version "$chart_version"; then + echo "chart=$(basename "$chart")" >> "$GITHUB_OUTPUT" + echo "chart_version=${chart_version}" >> "$GITHUB_OUTPUT" + echo "needsbump=true" >> "$GITHUB_OUTPUT" + fi + done + + - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7 + if: steps.check-bump.outputs.needsbump == 'true' + with: + script: | + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: ':warning: Chart `oci://${{ env.REGISTRY
}}/${{ github.repository }}/charts/${{ steps.check-bump.outputs.chart }}:${{ steps.check-bump.outputs.chart_version }}` already exists in OCI registry. Please increment the chart version.' + }) + core.setFailed(`Action failed with error: Chart version bump required`); diff --git a/.github/workflows/helm-release.yaml b/.github/workflows/helm-release.yaml new file mode 100644 index 0000000..a5751c3 --- /dev/null +++ b/.github/workflows/helm-release.yaml @@ -0,0 +1,99 @@ +name: Package Helm Chart and publish to GitHub Packages + +on: + workflow_dispatch: {} + push: + branches: + - main + paths: + - .github/workflows/helm-release.yaml + - charts/** + +permissions: + contents: write + packages: write + +env: + REGISTRY: ghcr.io + ACTIONS_RUNNER_DEBUG: false + +jobs: + helm-release: + runs-on: [ default ] + strategy: + fail-fast: false + matrix: + include: + - chartDir: charts/controlplane-operations + chartName: controlplane-operations + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure Git + run: | + git config user.name "$GITHUB_ACTOR" + git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + + - name: Set up Helm + uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0 + + - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5 + with: + python-version: 3.9 + check-latest: true + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Log into registry ${{ env.REGISTRY }} + if: github.event_name != 'pull_request' + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@2f7c5bfce28377bc069a65ba478de0a74aa0ca32 # v46 + with: + files: | + ${{ matrix.chartDir }}/** + + - name: Check if Helm chart with same 
version already exists + id: check-chart + if: steps.changed-files.outputs.all_changed_files != '' + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + # List all changed + echo "All changed files: $ALL_CHANGED_FILES" + chartName=$(yq .name "${{ matrix.chartDir }}/Chart.yaml") + chartVersion=$(yq .version "${{ matrix.chartDir }}/Chart.yaml") + echo "chart_version=${chartVersion}" >> "$GITHUB_OUTPUT" + if helm pull "oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/${{ matrix.chartName }}" --version "$chartVersion"; then + echo "bump=true" >> "$GITHUB_OUTPUT" + fi + + - name: Chart needs version bump + if: steps.check-chart.outputs.bump == 'true' + env: + CHART_VERSION: ${{ steps.check-chart.outputs.chart_version }} + run: | + echo "Chart ${{ matrix.chartDir }}:${{ env.CHART_VERSION }} already exists in OCI registry. Skipping upload. Please increment the chart version." + exit 1 + + - name: Push Charts to GHCR + if: steps.changed-files.outputs.all_changed_files != '' && steps.check-chart.outputs.bump != 'true' + run: | + helm package ${{ matrix.chartDir }} -d ${{ matrix.chartDir }} + PKG_NAME=`ls ${{ matrix.chartDir }}/*.tgz` + helm push ${PKG_NAME} oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/ + + - name: Run chart-releaser + uses: helm/chart-releaser-action@cae68fefc6b5f367a0275617c9f83181ba54714f # v1.7.0 + env: + CR_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CR_GENERATE_RELEASE_NOTES: true diff --git a/README.md b/README.md index 67bc06a..2ddaffb 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,29 @@ A set of Perses dashboards and Prometheus alert rules combined with playbooks to ensure effective operation of Controlplane clusters within the ApeiroRA stack. +# Content + +The content is structured as follows: + +``` +controlplane-operations + │ + ├── playbooks/ Step-by-step instructions for troubleshooting. + │ + └── charts/ + │ + └── controlplane-operations + │ + ├── alerts Prometheus alerts for kubernetes.
+ │ + ├── dashboards Perses dashboards for visualizing key metrics. + │ + └── Chart.yaml Helm chart manifest. +``` + ## Requirements and Setup -*Insert a short description what is required to get your project running...* +The content of the repository can be installed as a [Greenhouse](https://github.com/cloudoperators/greenhouse) Plugin. ## Support, Feedback, Contributing diff --git a/REUSE.toml b/REUSE.toml index a042536..74ec39c 100644 --- a/REUSE.toml +++ b/REUSE.toml @@ -1,2 +1,14 @@ -# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and Greenhouse contributors +# SPDX-License-Identifier: Apache-2.0 + +version = 1 +SPDX-PackageName = "controlplane-operations" +SPDX-PackageSupplier = "Vladimir Videlov " +SPDX-PackageDownloadLocation = "https://github.com/cloudoperators/controlplane-operations" +SPDX-PackageComment = "The code in this project may include calls to APIs (\"API Calls\") of\n SAP or third-party products or services developed outside of this project\n (\"External Products\").\n \"APIs\" means application programming interfaces, as well as their respective\n specifications and implementing code that allows software to communicate with\n other software.\n API Calls to External Products are not licensed under the open source license\n that governs this project. The use of such API Calls and related External\n Products are subject to applicable additional agreements with the relevant\n provider of the External Products. 
In no event shall the open source license\n that governs this project grant any rights in or to any External Products,or\n alter, expand or supersede any terms of the applicable additional agreements.\n If you have a valid license agreement with SAP for the use of a particular SAP\n External Product, then you may make use of any API Calls included in this\n project's code for that SAP External Product, subject to the terms of such\n license agreement. If you do not have a valid license agreement for the use of\n a particular SAP External Product, then you may only make use of any API Calls\n in this project for that SAP External Product for your internal, non-productive\n and non-commercial test and evaluation of such API Calls. Nothing herein grants\n you any rights to use or access any SAP External Product, or provide any third\n parties the right to use of access any SAP External Product, through API Calls." + +[[annotations]] +path = "**" +precedence = "aggregate" +SPDX-FileCopyrightText = "2025 SAP SE or an SAP affiliate company and Greenhouse contributors" +SPDX-License-Identifier = "Apache-2.0" diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml new file mode 100644 index 0000000..ce16259 --- /dev/null +++ b/charts/controlplane-operations/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +name: controlplane-operations +version: 1.0.1 +description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. 
+maintainers: + - name: Vladimir Videlov (d051408) + email: vladimir.videlov@sap.com +keywords: + - Helm Chart + - Controlplane operations + - Plutono Dashboards + - Prometheus Alerting + - Alert Rules + - Playbooks diff --git a/charts/controlplane-operations/alerts/controlplane-bond.yaml b/charts/controlplane-operations/alerts/controlplane-bond.yaml new file mode 100644 index 0000000..4afb2a7 --- /dev/null +++ b/charts/controlplane-operations/alerts/controlplane-bond.yaml @@ -0,0 +1,28 @@ +groups: +- name: controlplane-bond + rules: +{{- if not (.Values.prometheusRules.disabled.NodeBondDegradedMain | default false) }} + - alert: NodeBondDegradedMain + expr: sum(node_bonding_active) by (master, node) < 2 + for: {{ dig "NodeBondDegradedMain" "for" "15m" .Values.prometheusRules }} + labels: + severity: {{ dig "NodeBondDegradedMain" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeBondDegradedMain.md + {{ include "controlplane-operations.additionalRuleLabels" . | nindent 6 }} + annotations: + description: Bond `{{`{{ $labels.master }}`}}` on `{{`{{ $labels.node }}`}}` is degraded. Imminent network outage for this node. + summary: Bond `{{`{{ $labels.master }}`}}` is degraded. Node network connectivity is not HA. Switch failover or upgrade will cause an outage! +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.NodeVirtualInterfaceDown | default false) }} + - alert: NodeVirtualInterfaceDown + expr: sum(node_network_up{device=~"bond.*|vlan.*"} == 0) by (node, device) + for: {{ dig "NodeVirtualInterfaceDown" "for" "15m" .Values.prometheusRules }} + labels: + severity: {{ dig "NodeVirtualInterfaceDown" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/NodeVirtualInterfaceDown.md + {{ include "controlplane-operations.additionalRuleLabels" . 
| nindent 6 }} + annotations: + description: Interface `{{`{{ $labels.device }}`}}` on `{{`{{ $labels.node }}`}}` is down. Tenant network outage for this node. + summary: Interface `{{`{{ $labels.device }}`}}` is down. Node network connectivity is degraded. +{{- end }} diff --git a/charts/controlplane-operations/alerts/controlplane-node.yaml b/charts/controlplane-operations/alerts/controlplane-node.yaml new file mode 100644 index 0000000..70b5b89 --- /dev/null +++ b/charts/controlplane-operations/alerts/controlplane-node.yaml @@ -0,0 +1,15 @@ +groups: +- name: controlplane-node + rules: +{{- if not (.Values.prometheusRules.disabled.KubernetesNodeBridgeFilterVLANTagged | default false) }} + - alert: KubernetesNodeBridgeFilterVLANTagged + expr: kube_node_status_condition{condition="BridgeFilterVLANTagged", status="true"} == 1 + for: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "for" "15m" .Values.prometheusRules }} + labels: + severity: {{ dig "KubernetesNodeBridgeFilterVLANTagged" "severity" "info" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeBridgeFilterVLANTagged.md + {{ include "controlplane-operations.additionalRuleLabels" . | nindent 6 }} + annotations: + description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened! + summary: Bridged VLAN-tagged traffic is filtered by IPtables. 
+{{- end }} diff --git a/charts/controlplane-operations/alerts/controlplane-pvc.yaml b/charts/controlplane-operations/alerts/controlplane-pvc.yaml new file mode 100644 index 0000000..8646352 --- /dev/null +++ b/charts/controlplane-operations/alerts/controlplane-pvc.yaml @@ -0,0 +1,33 @@ +groups: +- name: controlplane-pvc + rules: + - record: kubelet_volume_stats_available_percent + expr: (min by (persistentvolumeclaim, namespace) (100 * kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes)) + +- name: controlplane.pvc.alerts + rules: +{{- if not (.Values.prometheusRules.disabled.KubernetesPVCNoSpaceLeft | default false) }} + - alert: KubernetesPVCNoSpaceLeft + expr: kubelet_volume_stats_available_percent < 10 + for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }} + labels: + severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "info" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md + {{ include "controlplane-operations.additionalRuleLabels" . | nindent 6 }} + annotations: + description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is almost full. Increase or delete files." + summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` free space is less than 10%." +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.KubernetesPVCNoSpaceLeft | default false) }} + - alert: KubernetesPVCNoSpaceLeft + expr: kubelet_volume_stats_available_percent < 2 + for: {{ dig "KubernetesPVCNoSpaceLeft" "for" "10m" .Values.prometheusRules }} + labels: + severity: {{ dig "KubernetesPVCNoSpaceLeft" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesPVCNoSpaceLeft.md + {{ include "controlplane-operations.additionalRuleLabels" . 
| nindent 6 }} + annotations: + description: "The PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` is full. Programs will stop working if relying upon free storage." + summary: "PVC `{{`{{ $labels.namespace }}`}}`/`{{`{{ $labels.persistentvolumeclaim }}`}}` usage is over 98%." +{{- end }} diff --git a/charts/controlplane-operations/kubernetes-logo.png b/charts/controlplane-operations/kubernetes-logo.png new file mode 100644 index 0000000..f2712c7 Binary files /dev/null and b/charts/controlplane-operations/kubernetes-logo.png differ diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml new file mode 100644 index 0000000..c0d6a36 --- /dev/null +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -0,0 +1,37 @@ +apiVersion: greenhouse.sap/v1alpha1 +kind: PluginDefinition +metadata: + name: controlplane-operations +spec: + version: 1.0.1 + displayName: Controlplane operations bundle + description: Operations bundle for Controlplane clusters + docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md + icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png + helmChart: + name: controlplane-operations + repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts + version: 1.0.1 + options: + - name: prometheusRules.create + description: Create Prometheus rules + required: false + default: true + type: bool + - name: prometheusRules.ruleSelector + description: Selector for Prometheus rules to be picked up by the Prometheus operator. List of key-value pairs. + required: false + type: list + - name: prometheusRules.additionalRuleLabels + description: Additional labels to be added to every Prometheus rule.
E.g support_group, service + required: false + type: map + - name: dashboards.create + description: Create Plutono dashboards + required: false + default: false + type: bool + - name: dashboards.dashboardSelector + description: Selector for dashboards to be picked up by the Plutono. List of key-value pairs. + required: false + type: list diff --git a/charts/controlplane-operations/templates/_helpers.tpl b/charts/controlplane-operations/templates/_helpers.tpl new file mode 100644 index 0000000..0975082 --- /dev/null +++ b/charts/controlplane-operations/templates/_helpers.tpl @@ -0,0 +1,41 @@ +{{/* Generate basic labels */}} +{{- define "controlplane-operations.labels" }} +{{- $path := index . 0 -}} +{{- $root := index . 1 -}} +app.kubernetes.io/version: {{ $root.Chart.Version }} +app.kubernetes.io/part-of: {{ $root.Release.Name }} +{{- if $root.Values.global.commonLabels}} +{{ toYaml $root.Values.global.commonLabels }} +{{- end }} +{{- end }} + +{{- define "controlplane-operations.ruleSelectorLabels" }} +{{- $path := index . 0 -}} +{{- $root := index . 1 -}} +plugin: {{ $root.Release.Name }} +{{- if $root.Values.prometheusRules.ruleSelectors }} +{{- range $i, $target := $root.Values.prometheusRules.ruleSelectors }} +{{ $target.name | required (printf "$.Values.prometheusRules.ruleSelectors.[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.prometheusRules.ruleSelectors.[%v].value missing" $i)) $root }} +{{- end }} +{{- end }} +{{- end }} + +{{- define "controlplane-operations.additionalRuleLabels" }} +{{- if .Values.prometheusRules.additionalRuleLabels }} + {{- toYaml .Values.prometheusRules.additionalRuleLabels }} +{{- end }} +{{- if .Values.global.commonLabels }} +{{ tpl (toYaml .Values.global.commonLabels) . }} +{{- end }} +{{- end }} + +{{- define "controlplane-operations.dashboardSelectorLabels" }} +{{- $path := index . 0 -}} +{{- $root := index .
1 -}} +plugin: {{ $root.Release.Name }} +{{- if $root.Values.dashboards.persesSelectors }} +{{- range $i, $target := $root.Values.dashboards.persesSelectors }} +{{ $target.name | required (printf "$.Values.dashboards.persesSelectors.[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.dashboards.persesSelectors.[%v].value missing" $i)) $root }} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/controlplane-operations/templates/alerts.yaml b/charts/controlplane-operations/templates/alerts.yaml new file mode 100644 index 0000000..7d14ddf --- /dev/null +++ b/charts/controlplane-operations/templates/alerts.yaml @@ -0,0 +1,25 @@ +{{- if .Values.prometheusRules.create -}} +{{- $root := . -}} +{{- range $path, $bytes := .Files.Glob "alerts/*.yaml" }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ printf "%s-%s" $.Release.Name $path | replace "/" "-" | trimSuffix ".yaml" | trunc 63 }} + labels: +{{ include "controlplane-operations.ruleSelectorLabels" (list $path $root) | indent 4 }} +{{ include "controlplane-operations.labels" (list $path $root) | indent 4 }} +{{- if $.Values.prometheusRules.labels }} +{{ toYaml $.Values.prometheusRules.labels | indent 4 }} +{{- end }} +{{- if $.Values.prometheusRules.annotations }} + annotations: +{{ toYaml $.Values.prometheusRules.annotations | indent 4 }} +{{- end }} +spec: +{{- with $root -}} +{{- $content := printf "%s" $bytes }} +{{ tpl $content . | indent 2 }} +{{- end }} +--- +{{- end }} +{{- end }} diff --git a/charts/controlplane-operations/templates/dashboards.yaml b/charts/controlplane-operations/templates/dashboards.yaml new file mode 100644 index 0000000..1ca318e --- /dev/null +++ b/charts/controlplane-operations/templates/dashboards.yaml @@ -0,0 +1,16 @@ +{{- if .Values.dashboards.create }} +{{ $root := .
}} +{{- range $path, $bytes := .Files.Glob "perses-dashboards/*.json" }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" $root.Release.Name ($path | replace ".json" "" | replace "/" "-" | trunc 63) }} + labels: +{{ include "controlplane-operations.dashboardSelectorLabels" (list $path $root) | indent 4 }} +{{ include "controlplane-operations.labels" (list $path $root) | indent 4 }} +data: +{{ printf "%s: |-" ($path | replace "/" "-" | indent 2) }} +{{ printf "%s" $bytes | indent 4 }} +{{- end }} +{{- end }} diff --git a/charts/controlplane-operations/values.yaml b/charts/controlplane-operations/values.yaml new file mode 100644 index 0000000..3b01116 --- /dev/null +++ b/charts/controlplane-operations/values.yaml @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Greenhouse contributors +# SPDX-License-Identifier: Apache-2.0 + +global: + ## Common labels to add to all resources + ## + commonLabels: {} + +## Create default rules for monitoring the cluster +## +prometheusRules: + + ## Enables PrometheusRule resources to be created + create: true + + ## Label selectors for the Prometheus rules to be picked up by Prometheus.
+ ruleSelectors: + # - name: plugin + # value: kube-monitoring + # - name: prometheus + # value: kubernetes + + ## Labels for PrometheusRules + labels: {} + + ## Annotations for PrometheusRules + annotations: {} + + ## Additional labels for PrometheusRule alerts + ## This is useful for adding additional labels such as "support_group" or "service" for the routing of alerts to each rule + additionalRuleLabels: + # support_group: support + # service: my-service + + ## Additional annotations for PrometheusRule alerts + additionalRuleAnnotations: {} + + ## Disabled PrometheusRule alerts + disabled: {} + # KubernetesApiServerDown: true + # KubeletDown: true + + +## Create default dashboards for monitoring the cluster +## +dashboards: + + ## Enables ConfigMap resources with dashboards to be created + create: false + + # -- Label selectors for the Perses dashboards to be picked up by Perses. + persesSelectors: + - name: perses.dev/resource + value: '"true"' diff --git a/playbooks/NodeBondDegradedMain.md b/playbooks/NodeBondDegradedMain.md new file mode 100644 index 0000000..688e4aa --- /dev/null +++ b/playbooks/NodeBondDegradedMain.md @@ -0,0 +1,82 @@ +--- +title: NodeBondDegradedMain +weight: 20 +--- + +# NodeBondDegradedMain + +## Problem + +The Kubernetes node is experiencing network bonding issues, which can lead to degraded performance or connectivity problems. This may manifest as slow network speeds, intermittent connectivity, or complete loss of network access for the node. + +## Impact + +The impact of this issue can vary depending on the severity of the network bonding problem. In a worst-case scenario, the node may become completely unreachable, leading to downtime for any applications or services running on that node. Even if the node remains reachable, performance degradation can lead to slow response times and increased latency for users and applications. + +## Diagnosis + +1. 
**Check Node Status**: Use the `kubectl get nodes` command to check the status of the node. Look for any nodes that are not in the "Ready" state. + ```bash + kubectl get nodes + ``` +2. **Check Network Interfaces**: Use the `ip addr` command to check the status of the network interfaces on the node. Look for any interfaces that are down or not configured correctly. + ```bash + ip addr + ``` +3. **Check Bonding Configuration**: Use the `cat /proc/net/bonding/bond0` command (or the appropriate bond interface) to check the bonding configuration. Look for any errors or misconfigurations in the bonding mode or slave interfaces. + ```bash + cat /proc/net/bonding/bond0 + ``` +4. **Check Logs**: Check the system logs for any errors related to network bonding. Use the `dmesg` command or check the `/var/log/syslog` or `/var/log/messages` files. + ```bash + dmesg | grep bonding + ``` + ```bash + tail -n 100 /var/log/syslog | grep bonding + ``` +5. **Check Network Configuration**: Review the network configuration files (e.g., `/etc/network/interfaces` or `/etc/sysconfig/network-scripts/ifcfg-*`) to ensure that the bonding configuration is correct and matches the expected setup. +6. **Check Network Connectivity**: Use the `ping` command to test connectivity to other nodes or external resources. This can help identify if the issue is isolated to the node or if it affects the entire network. + ```bash + ping <target-host> + ``` +7. **Check Firewall Rules**: Ensure that there are no firewall rules blocking traffic to or from the node. Use the `iptables` command to check the current rules. + ```bash + iptables -L -n + ``` +8. **Check Network Performance**: Use tools like `iperf` or `netstat` to check network performance and identify any bottlenecks or issues with the network interfaces. + ```bash + iperf -c <server-ip> + ``` + ```bash + netstat -i + ``` +9. **Check Kubernetes Events**: Use the `kubectl describe node <node-name>` command to check for any events related to the node that may indicate network issues.
+ ```bash + kubectl describe node <node-name> + ``` +10. **Check CNI Plugin**: If using a Container Network Interface (CNI) plugin, check the plugin's logs and configuration to ensure it is functioning correctly and not causing network issues. + + +## Resolution steps +1. **Restart Network Services**: Restart the network services on the node to reinitialize the network interfaces and bonding configuration. + ```bash + sudo systemctl restart networking + ``` +2. **Reconfigure Bonding**: If the bonding configuration is incorrect, reconfigure it according to the desired setup. This may involve editing the network configuration files and restarting the network services. +3. **Check Hardware**: If the issue persists, check the physical network interfaces and cables for any hardware issues. This may involve reseating cables or replacing faulty hardware. +4. **Update Drivers**: Ensure that the network drivers are up to date. This may involve updating the kernel or installing new drivers for the network interfaces. +5. **Reboot Node**: If all else fails, consider rebooting the node to reset the network stack and reinitialize the bonding configuration. + ```bash + sudo reboot + ``` +6. **Monitor Network Performance**: After resolving the issue, monitor the network performance to ensure that the problem does not recur. Use tools like `iftop` or `nload` to monitor network traffic and performance. + ```bash + iftop -i <interface> + ``` + ```bash + nload + ``` +7. **Document Changes**: Document any changes made to the network configuration or bonding setup for future reference. This can help in troubleshooting similar issues in the future. +8. **Notify Stakeholders**: If the issue caused downtime or performance degradation, notify stakeholders and users of the resolution and any potential impact on services. +9. **Review and Update Procedures**: Review the incident and update any procedures or documentation to prevent similar issues in the future.
This may involve updating network configuration templates or improving monitoring and alerting for network issues. +