Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.0.29
version: 1.1.0
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
92 changes: 68 additions & 24 deletions charts/controlplane-operations/alerts/controlplane-gardener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,51 +4,95 @@ groups:

### Gardener ###

{{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }}
- alert: ShootUnavailability
expr: shoot:availability{instance=~"https:\\/\\/api\\..+",container=""} == 0
for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }}
{{- if not (.Values.prometheusRules.disabled.ShootReconciliationFailed | default false) }}
# ShootReconciliationFailed: evaluates garden_shoot_operation_states for the
# Reconcile operation over a subquery window (default 30m, overridable through
# prometheusRules.ShootReconciliationFailed.for) and alerts when the minimum
# over that window is not 1.
- alert: ShootReconciliationFailed
# NOTE(review): "A != 1 or A != 2" is a PromQL set union of two filters that
# together keep every series, so the inner filter appears to have no effect.
# Confirm which operation states are meant to be excluded.
expr: min_over_time((garden_shoot_operation_states{operation="Reconcile"} != 1 or garden_shoot_operation_states{operation="Reconcile"} != 2)[{{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}:]) != 1
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "ShootUnavailability" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootUnavailability.md
service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
severity: {{ dig "ShootReconciliationFailed" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootReconciliationFailed.md
service: gardener
# The project-to-support-group mapping is evaluated by Prometheus when the
# alert fires, because Helm cannot see $labels.project at render time (a Helm
# helper only ever receives the literal placeholder text and never matches).
# Unknown projects fall back to the configured support group. The value is
# single-quoted so YAML does not read the leading braces as a flow mapping.
support_group: '{{`{{ if eq $labels.project "cp" }}containers{{ else if eq $labels.project "compute" }}compute{{ else if eq $labels.project "storage" }}storage{{ else }}`}}{{ dig "ShootReconciliationFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}{{`{{ end }}`}}'
annotations:
description: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. Need manual investigation of the root cause. Check the shoot and underlying infrastructure for issues.
summary: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable.
description: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully for {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}. Check the shoot's conditions and events for more details.
summary: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.ShootConditionNotTrue | default false) }}
# ShootConditionNotTrue: alerts per project/landscape/seed/name when any
# garden_shoot_condition series has a value other than 1 for the configured
# duration (default 30m). Severity defaults to warning and can be overridden
# through prometheusRules.ShootConditionNotTrue.
- alert: ShootConditionNotTrue
expr: max by(project,landscape,seed,name) (garden_shoot_condition != 1)
for: {{ dig "ShootConditionNotTrue" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "ShootConditionNotTrue" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootConditionNotTrue.md
service: gardener
# The project-to-support-group mapping is evaluated by Prometheus when the
# alert fires, because Helm cannot see $labels.project at render time (a Helm
# helper only ever receives the literal placeholder text and never matches).
# Unknown projects fall back to the configured support group. The value is
# single-quoted so YAML does not read the leading braces as a flow mapping.
support_group: '{{`{{ if eq $labels.project "cp" }}containers{{ else if eq $labels.project "compute" }}compute{{ else if eq $labels.project "storage" }}storage{{ else }}`}}{{ dig "ShootConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}{{`{{ end }}`}}'
annotations:
description: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. Check the Shoot's conditions and events for more details.
summary: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.SeedConditionNotTrue | default false) }}
# SeedConditionNotTrue: alerts per landscape/name when any
# garden_seed_condition series has a value other than 1 for the configured
# duration (default 30m). The rule is skipped entirely when
# prometheusRules.disabled.SeedConditionNotTrue is set.
- alert: SeedConditionNotTrue
expr: max by(landscape,name) (garden_seed_condition != 1)
for: {{ dig "SeedConditionNotTrue" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "SeedConditionNotTrue" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/SeedConditionNotTrue.md
service: gardener
# Support group comes from the per-alert override, else defaultSupportGroup.
support_group: {{ dig "SeedConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Seed {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} has a condition that is not True. Check the Seed's conditions and events for more details.
summary: Seed {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} has a condition that is not True.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.GardenletNotReady | default false) }}
# GardenletNotReady: alerts per landscape when the maximum of the
# garden_garden_condition series carrying status="True" is not 1 for the
# configured duration (default 30m).
# NOTE(review): the metric is named garden_garden_condition while the alert
# text talks about the Gardenlet; confirm this is the intended signal. The
# comparison yields no result when no matching series exists, so a missing
# metric does not fire this alert.
- alert: GardenletNotReady
expr: max by(landscape) (garden_garden_condition{status="True"}) != 1
for: {{ dig "GardenletNotReady" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "GardenletNotReady" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/GardenletNotReady.md
service: gardener
# Support group comes from the per-alert override, else defaultSupportGroup.
support_group: {{ dig "GardenletNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. Check the Gardenlet's conditions and events for more details.
summary: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready.
{{- end }}

### Calico ###

{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionDown | default false) }}
# CalicoBgpNeighborSessionDown: alerts per cluster/node/pod when the sum of
# established BGP sessions reported by bird_protocol_up is below
# prometheusRules.calico.bgpNeighborCount for the configured duration
# (default 30m).
- alert: CalicoBgpNeighborSessionDown
expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }}
expr: sum by (cluster,node,pod) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }}
for: {{ dig "CalicoBgpNeighborSessionDown" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "CalicoBgpNeighborSessionDown" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionDown.md
service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
service: calico
support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
summary: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
# The shoot prefix is stripped with reReplaceAll because the annotation is
# expanded by Prometheus, whose template functions do not include the Sprig
# helper trimPrefix; an undefined function makes the rule fail validation.
description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }}
# CalicoBgpNeighborSessionAllDown: alerts per cluster/node/pod when the sum of
# established BGP sessions reported by bird_protocol_up equals 0 for the
# configured duration (default 10m).
- alert: CalicoBgpNeighborSessionAllDown
expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) == 0
expr: sum by (cluster,node,pod) (bird_protocol_up{proto="BGP",state="Established"}) == 0
for: {{ dig "CalicoBgpNeighborSessionAllDown" "for" "10m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "CalicoBgpNeighborSessionAllDown" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionAllDown.md
service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
service: calico
support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Node {{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
summary: Node {{`{{ $labels.node }}`}} has no BGP neighbors.
# The shoot prefix is stripped with reReplaceAll because the annotation is
# expanded by Prometheus, whose template functions do not include the Sprig
# helper trimPrefix; an undefined function makes the rule fail validation.
description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has no BGP neighbors.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }}
Expand All @@ -59,11 +103,11 @@ groups:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "CalicoNodeMissing" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeMissing.md
service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
service: calico
support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Calico is not running on all bare metal nodes. Network datapath threatened!
summary: Calico is not running on all nodes.
description: Calico is not running on all BareMetal nodes. Network datapath threatened!
summary: Calico is not running on all BareMetal nodes.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }}
Expand All @@ -74,9 +118,9 @@ groups:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "CalicoNodeNotReady" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeNotReady.md
service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
service: calico
support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened!
summary: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready.
description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened!
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready.
{{- end }}
4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.0.29
version: 1.1.0
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.0.29
version: 1.1.0
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
10 changes: 10 additions & 0 deletions charts/controlplane-operations/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,13 @@ plugin: {{ $root.Release.Name }}
{{- end }}
{{- end }}
{{- end }}

{{- /*
controlplane-operations.supportGroup maps the string passed as context to a
support group name: "cp" yields "containers", "compute" yields "compute" and
"storage" yields "storage". Any other input renders as an empty string, which
lets a caller chain a fallback with "| default".
*/}}
{{- define "controlplane-operations.supportGroup" }}
{{- get (dict "cp" "containers" "compute" "compute" "storage" "storage") . }}
{{- end }}
Loading