Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.1.5
version: 1.1.6
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
74 changes: 74 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-backup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Prometheus alerting rules for the etcd KCP backup pods.
# Every rule can be switched off individually through
# .Values.prometheusRules.disabled.<AlertName>; severity, service and
# support_group can be overridden per alert under .Values.prometheusRules.<AlertName>.
# NOTE(review): the playbook links below have no /blob/<branch>/ path segment —
# confirm that they resolve on GitHub.
groups:
- name: controlplane-backup
  rules:
{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupCrashLoopBackOff | default false) }}
  # Fires when an etcd-kcp-backup container reported the CrashLoopBackOff
  # waiting reason at any point during the last 5 minutes.
  # The guard uses this alert's own name so it can be disabled independently
  # of EtcdKCPBackupRestartRate.
  - alert: EtcdKCPBackupCrashLoopBackOff
    expr: >
      max_over_time(
        kube_pod_container_status_waiting_reason{
          pod=~"etcd-kcp-backup-[a-z0-9]{8,10}-[a-z0-9]{5}",
          reason="CrashLoopBackOff"
        }[5m]
      ) == 1
    labels:
      {{ include "controlplane-operations.additionalRuleLabels" . }}
      severity: {{ dig "EtcdKCPBackupCrashLoopBackOff" "severity" "info" .Values.prometheusRules }}
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupCrashLoopBackOff.md
      service: {{ dig "EtcdKCPBackupCrashLoopBackOff" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
      support_group: {{ dig "EtcdKCPBackupCrashLoopBackOff" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
    annotations:
      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff. Check pod logs and events for more details.
      summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is in CrashLoopBackOff.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupRestartRate | default false) }}
  # Fires when the restart counter of an etcd-kcp-backup container has
  # increased within the last 15 minutes (per-second rate above zero).
  - alert: EtcdKCPBackupRestartRate
    expr: >
      rate(kube_pod_container_status_restarts_total{pod=~"etcd-kcp-backup-[a-z0-9]{8,10}-[a-z0-9]{5}"}[15m]) > 0
    labels:
      {{ include "controlplane-operations.additionalRuleLabels" . }}
      severity: {{ dig "EtcdKCPBackupRestartRate" "severity" "info" .Values.prometheusRules }}
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupRestartRate.md
      service: {{ dig "EtcdKCPBackupRestartRate" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
      support_group: {{ dig "EtcdKCPBackupRestartRate" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
    annotations:
      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently. Check pod logs and events for more details.
      summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} on cluster {{`{{ $labels.cluster }}`}} is restarting frequently.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupFailure | default false) }}
  # Fires when the count of failed full snapshots has grown within the window.
  # The metric is a cumulative counter, so comparing it with "== 1" would only
  # match while the lifetime failure count is exactly one and would stay
  # silent for every later failure.
  # NOTE(review): the 1h window is an assumption — align it with the actual
  # full-snapshot schedule.
  - alert: EtcdKCPBackupFailure
    expr: >
      increase(etcdbr_snapshot_duration_seconds_count{kind="Full",succeeded="false"}[1h]) > 0
    labels:
      {{ include "controlplane-operations.additionalRuleLabels" . }}
      severity: {{ dig "EtcdKCPBackupFailure" "severity" "info" .Values.prometheusRules }}
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupFailure.md
      service: {{ dig "EtcdKCPBackupFailure" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
      support_group: {{ dig "EtcdKCPBackupFailure" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
    annotations:
      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} failed full snapshot. Check pod logs and events for more details.
      summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} failed full snapshot.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.EtcdKCPBackupSnapshotTooOld | default false) }}
  # Fires for every backup pod whose latest full snapshot is older than
  # 2 hours, but only while more than 2 pods are stale at the same time.
  # The explanation lives here rather than inside the expression: a PromQL "#"
  # comment in a folded scalar can swallow the rest of the query once the
  # lines are folded together.
  # NOTE(review): the earlier inline comment also said "2 or more" of the
  # 3 backup pods; if that is the intent, change "> 2" to ">= 2".
  - alert: EtcdKCPBackupSnapshotTooOld
    expr: >
      (
        (time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 2 * 3600
      )
      AND
      on()
      count(
        (time() - etcdbr_snapshot_latest_timestamp{kind="Full"}) > 2 * 3600
      ) > 2
    labels:
      {{ include "controlplane-operations.additionalRuleLabels" . }}
      severity: {{ dig "EtcdKCPBackupSnapshotTooOld" "severity" "info" .Values.prometheusRules }}
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/EtcdKCPBackupSnapshotTooOld.md
      service: {{ dig "EtcdKCPBackupSnapshotTooOld" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
      support_group: {{ dig "EtcdKCPBackupSnapshotTooOld" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
    annotations:
      description: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot. Check pod logs and events for more details.
      summary: etcd KCP backup Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} with instance IP {{`{{ $labels.instance }}`}} has an outdated full snapshot.
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ groups:
description: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready. Check node conditions and events for more details.
summary: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not Ready.
{{- end }}

4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.1.5
version: 1.1.6
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.1.5
version: 1.1.6
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
Loading