diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml
index 297446b..ca328a4 100644
--- a/charts/controlplane-operations/Chart.yaml
+++ b/charts/controlplane-operations/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: controlplane-operations
-version: 1.1.5
+version: 1.1.6
 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
 maintainers:
   - name: Vladimir Videlov (d051408)
diff --git a/charts/controlplane-operations/alerts/controlplane-backup.yaml b/charts/controlplane-operations/alerts/controlplane-backup.yaml
new file mode 100644
index 0000000..c40aa86
--- /dev/null
+++ b/charts/controlplane-operations/alerts/controlplane-backup.yaml
@@ -0,0 +1,88 @@
+groups:
+- name: controlplane-backup
+  rules:
+{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupCrashLoopBackOff | default false) }}
+  # Fires when a container of an etcd-kcp-backup pod reported the
+  # CrashLoopBackOff waiting reason at any point within the last 5 minutes.
+  - alert: EtcdKCPBackupCrashLoopBackOff
+    expr: >
+      max_over_time(
+        kube_pod_container_status_waiting_reason{
+          pod=~"etcd-kcp-backup-[a-z0-9]{8,10}-[a-z0-9]{5}",
+          reason="CrashLoopBackOff"
+        }[5m]
+      ) == 1
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "EtcdKCPBackupCrashLoopBackOff" "severity" "info" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupCrashLoopBackOff.md
+      service: {{ dig "EtcdKCPBackupCrashLoopBackOff" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "EtcdKCPBackupCrashLoopBackOff" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff. Check pod logs and events for more details.
+      summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} on {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff.
+{{- end }}
+
+{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupRestartRate | default false) }}
+  # Fires when the restart counter of an etcd-kcp-backup pod container
+  # increased within the last 15 minutes.
+  # NOTE(review): any single restart satisfies `> 0`, while the annotations
+  # say "restarting frequently"; confirm the threshold is intended.
+  - alert: EtcdKCPBackupRestartRate
+    expr: >
+      rate(kube_pod_container_status_restarts_total{pod=~"etcd-kcp-backup-[a-z0-9]{8,10}-[a-z0-9]{5}"}[15m]) > 0
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "EtcdKCPBackupRestartRate" "severity" "info" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupRestartRate.md
+      service: {{ dig "EtcdKCPBackupRestartRate" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "EtcdKCPBackupRestartRate" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. Check pod logs and events for more details.
+      summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} on {{`{{ $labels.cluster }}`}} is restarting frequently.
+{{- end }}
+
+{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupFailure | default false) }}
+  # Fires when the failed-full-snapshot series is greater than zero.
+  # NOTE(review): the metric name suggests a cumulative count, in which case
+  # the alert stays active after one failure until the series resets; confirm.
+  - alert: EtcdKCPBackupFailure
+    expr: >
+      etcdbr_snapshot_duration_seconds_count{kind="Full",succeeded="false"} > 0
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "EtcdKCPBackupFailure" "severity" "info" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupFailure.md
+      service: {{ dig "EtcdKCPBackupFailure" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "EtcdKCPBackupFailure" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} failed full snapshot. Check pod logs and events for more details.
+      summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} failed full snapshot.
+{{- end }}
+
+{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupSnapshotTooOld | default false) }}
+  # Fires for every series whose latest full snapshot is older than 2 hours,
+  # but only while more than 2 such series exist at the same time.
+  # NOTE(review): the previous inline comment said both "more than 2" and
+  # "2 or more"; the expression implements "more than 2" (`count(...) > 2`),
+  # i.e. all pods of a 3-pod setup. Confirm the intended threshold.
+  - alert: EtcdKCPBackupSnapshotTooOld
+    expr: >
+      (
+        (time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 2 * 3600
+      )
+      AND
+      on()
+      count(
+        (time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 2 * 3600
+      ) > 2
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "EtcdKCPBackupSnapshotTooOld" "severity" "info" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupSnapshotTooOld.md
+      service: {{ dig "EtcdKCPBackupSnapshotTooOld" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "EtcdKCPBackupSnapshotTooOld" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot. Check pod logs and events for more details.
+      summary: etcd KCP backup Pod {{`{{ $labels.pod }}`}} has an outdated full snapshot.
+{{- end }}
diff --git a/charts/controlplane-operations/alerts/controlplane-node.yaml b/charts/controlplane-operations/alerts/controlplane-node.yaml
index 49c85f7..2ecbbce 100644
--- a/charts/controlplane-operations/alerts/controlplane-node.yaml
+++ b/charts/controlplane-operations/alerts/controlplane-node.yaml
@@ -30,3 +30,4 @@ groups:
       description: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready. Check node conditions and events for more details.
       summary: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready.
 {{- end }}
+
diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml
index 8b9a4c7..69fbde4 100644
--- a/charts/controlplane-operations/plugindefinition.yaml
+++ b/charts/controlplane-operations/plugindefinition.yaml
@@ -3,7 +3,7 @@ kind: PluginDefinition
 metadata:
   name: controlplane-operations
 spec:
-  version: 1.1.5
+  version: 1.1.6
   displayName: Controlplane operations bundle
-  description: Operations bundle for Controlane clusters
+  description: Operations bundle for Controlplane clusters
   docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
@@ -11,7 +11,7 @@ spec:
   helmChart:
     name: controlplane-operations
     repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
-    version: 1.1.5
+    version: 1.1.6
   options:
     - name: prometheusRules.create
       description: Create Prometheus rules