From b57fdcf219d4c94d2d2c83dd64913548eba9d4e7 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Thu, 2 Apr 2026 11:18:40 +0200 Subject: [PATCH 1/3] etcd backup pod alerts --- charts/controlplane-operations/Chart.yaml | 2 +- .../alerts/controlplane-backup.yaml | 37 +++++++++++++++++++ .../alerts/controlplane-node.yaml | 1 + .../plugindefinition.yaml | 4 +- 4 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 charts/controlplane-operations/alerts/controlplane-backup.yaml diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index 297446b..ca328a4 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.1.5 +version: 1.1.6 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/alerts/controlplane-backup.yaml b/charts/controlplane-operations/alerts/controlplane-backup.yaml new file mode 100644 index 0000000..8c7a4c2 --- /dev/null +++ b/charts/controlplane-operations/alerts/controlplane-backup.yaml @@ -0,0 +1,37 @@ +groups: +- name: controlplane-backup + rules: +{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupRestartRate | default false) }} + - alert: EtcdKCPBackupCrashLoopBackOff + expr: > + max_over_time( + kube_pod_container_status_waiting_reason{ + pod=~"etcd-kcp-backup-[a-z0-9]{8,10}-[a-z0-9]{5}", + reason="CrashLoopBackOff" + }[5m] + ) == 1 + labels: + {{ include "controlplane-operations.additionalRuleLabels" . 
}} + severity: {{ dig "EtcdKCPBackupCrashLoopBackOff" "severity" "info" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupCrashLoopBackOff.md + service: {{ dig "EtcdKCPBackupCrashLoopBackOff" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "EtcdKCPBackupCrashLoopBackOff" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff. Check pod logs and events for more details. + summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupRestartRate | default false) }} + - alert: EtcdKCPBackupRestartRate + expr: > + rate(kube_pod_container_status_restarts_total{pod=~"etcd-kcp-backup-[a-z0-9]{8,10}-[a-z0-9]{5}"}[15m]) > 0 + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "EtcdKCPBackupRestartRate" "severity" "info" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupRestartRate.md + service: {{ dig "EtcdKCPBackupRestartRate" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "EtcdKCPBackupRestartRate" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. Check pod logs and events for more details. + summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. 
+{{- end }} diff --git a/charts/controlplane-operations/alerts/controlplane-node.yaml b/charts/controlplane-operations/alerts/controlplane-node.yaml index 49c85f7..2ecbbce 100644 --- a/charts/controlplane-operations/alerts/controlplane-node.yaml +++ b/charts/controlplane-operations/alerts/controlplane-node.yaml @@ -30,3 +30,4 @@ groups: description: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready. Check node conditions and events for more details. summary: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready. {{- end }} + diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index 8b9a4c7..69fbde4 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.1.5 + version: 1.1.6 displayName: Controlplane operations bundle description: Operations bundle for Controlane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.1.5 + version: 1.1.6 options: - name: prometheusRules.create description: Create Prometheus rules From 1f7d9a5751d8b15313cdd1302546274a4c509ede Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 7 Apr 2026 11:42:33 +0200 Subject: [PATCH 2/3] etcdbr snapshot failure or too old --- .../alerts/controlplane-backup.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/charts/controlplane-operations/alerts/controlplane-backup.yaml b/charts/controlplane-operations/alerts/controlplane-backup.yaml index 8c7a4c2..9ce609f 100644 --- a/charts/controlplane-operations/alerts/controlplane-backup.yaml +++ 
b/charts/controlplane-operations/alerts/controlplane-backup.yaml @@ -35,3 +35,40 @@ groups: description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. Check pod logs and events for more details. summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. {{- end }} + +{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupFailure | default false) }} + - alert: EtcdKCPBackupFailure + expr: > + etcdbr_snapshot_duration_seconds_count{kind="Full",succeeded="false"} == 1 + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "EtcdKCPBackupFailure" "severity" "info" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupFailure.md + service: {{ dig "EtcdKCPBackupFailure" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "EtcdKCPBackupFailure" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} failed full snapshot. Check pod logs and events for more details. + summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} failed full snapshot. 
+{{- end }} + +{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupSnapshotTooOld | default false) }} + - alert: EtcdKCPBackupSnapshotTooOld + expr: > + ( # Alert if the latest full snapshot is older than 2 hours on more than 2 backup pods (we have 3 backup pods, so the alert only fires when all 3 are affected) + (time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 2 * 3600 + ) + AND + on() + count( + (time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 2 * 3600 + ) > 2 + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "EtcdKCPBackupSnapshotTooOld" "severity" "info" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupSnapshotTooOld.md + service: {{ dig "EtcdKCPBackupSnapshotTooOld" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "EtcdKCPBackupSnapshotTooOld" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot. Check pod logs and events for more details. + summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot. 
+{{- end }} From e4d308012a1ccef63d928c1029d1e55d691c7e61 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 7 Apr 2026 12:14:34 +0200 Subject: [PATCH 3/3] review comments --- .../alerts/controlplane-backup.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-backup.yaml b/charts/controlplane-operations/alerts/controlplane-backup.yaml index 9ce609f..c40aa86 100644 --- a/charts/controlplane-operations/alerts/controlplane-backup.yaml +++ b/charts/controlplane-operations/alerts/controlplane-backup.yaml @@ -1,7 +1,7 @@ groups: - name: controlplane-backup rules: -{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupRestartRate | default false) }} +{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupCrashLoopBackOff | default false) }} - alert: EtcdKCPBackupCrashLoopBackOff expr: > max_over_time( @@ -18,7 +18,7 @@ groups: support_group: {{ dig "EtcdKCPBackupCrashLoopBackOff" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff. Check pod logs and events for more details. - summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff. + summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} on {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff. {{- end }} {{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupRestartRate | default false) }} @@ -33,13 +33,13 @@ groups: support_group: {{ dig "EtcdKCPBackupRestartRate" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. 
Check pod logs and events for more details. - summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. + summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} on {{`{{ $labels.cluster }}`}} is restarting frequently. {{- end }} {{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupFailure | default false) }} - alert: EtcdKCPBackupFailure expr: > - etcdbr_snapshot_duration_seconds_count{kind="Full",succeeded="false"} == 1 + etcdbr_snapshot_duration_seconds_count{kind="Full",succeeded="false"} > 0 labels: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "EtcdKCPBackupFailure" "severity" "info" .Values.prometheusRules }} @@ -48,7 +48,7 @@ groups: support_group: {{ dig "EtcdKCPBackupFailure" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} failed full snapshot. Check pod logs and events for more details. - summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} failed full snapshot. + summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} failed full snapshot. {{- end }} {{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupSnapshotTooOld | default false) }} @@ -70,5 +70,5 @@ groups: support_group: {{ dig "EtcdKCPBackupSnapshotTooOld" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot. Check pod logs and events for more details. 
- summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot. + summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} has an outdated full snapshot. {{- end }}