diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index df16ecf..4d5ed48 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.0.29 +version: 1.1.0 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 67374d1..8dc469a 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -4,51 +4,95 @@ groups: ### Gardener ### -{{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }} - - alert: ShootUnavailability - expr: shoot:availability{instance=~"https:\\/\\/api\\..+",container=""} == 0 - for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }} +{{- if not (.Values.prometheusRules.disabled.ShootReconciliationFailed | default false) }} + - alert: ShootReconciliationFailed + expr: min_over_time(garden_shoot_operation_states{operation="Reconcile"}[{{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}:]) != 1 labels: {{ include "controlplane-operations.additionalRuleLabels" . 
}} - severity: {{ dig "ShootUnavailability" "severity" "warning" .Values.prometheusRules }} - playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootUnavailability.md - service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} - support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + severity: {{ dig "ShootReconciliationFailed" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootReconciliationFailed.md + service: gardener + support_group: '{{`{{ if eq $labels.project "cp" }}containers{{ else if eq $labels.project "compute" }}compute{{ else if eq $labels.project "storage" }}storage{{ else }}`}}{{ dig "ShootReconciliationFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}{{`{{ end }}`}}' annotations: - description: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. Need manual investigation of the root cause. Check the shoot and underlying infrastructure for issues. - summary: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. + description: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully for {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}. Check the shoot's conditions and events for more details. + summary: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.ShootConditionNotTrue | default false) }} + - alert: ShootConditionNotTrue + expr: max by(project,landscape,seed,name) (garden_shoot_condition != 1) + for: {{ dig "ShootConditionNotTrue" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . 
}} + severity: {{ dig "ShootConditionNotTrue" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootConditionNotTrue.md + service: gardener + support_group: '{{`{{ if eq $labels.project "cp" }}containers{{ else if eq $labels.project "compute" }}compute{{ else if eq $labels.project "storage" }}storage{{ else }}`}}{{ dig "ShootConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}{{`{{ end }}`}}' + annotations: + description: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. Check the Shoot's conditions and events for more details. + summary: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.SeedConditionNotTrue | default false) }} + - alert: SeedConditionNotTrue + expr: max by(landscape,name) (garden_seed_condition != 1) + for: {{ dig "SeedConditionNotTrue" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "SeedConditionNotTrue" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/SeedConditionNotTrue.md + service: gardener + support_group: {{ dig "SeedConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Seed {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} has a condition that is not True. Check the Seed's conditions and events for more details. + summary: Seed {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} has a condition that is not True. 
+{{- end }} + +{{- if not (.Values.prometheusRules.disabled.GardenletNotReady | default false) }} + - alert: GardenletNotReady + expr: max by(landscape) (garden_garden_condition{status="True"}) != 1 + for: {{ dig "GardenletNotReady" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "GardenletNotReady" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/GardenletNotReady.md + service: gardener + support_group: {{ dig "GardenletNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. Check the Gardenlet's conditions and events for more details. + summary: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. {{- end }} ### Calico ### {{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionDown | default false) }} - alert: CalicoBgpNeighborSessionDown - expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }} + expr: sum by (cluster,node,pod) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }} for: {{ dig "CalicoBgpNeighborSessionDown" "for" "30m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "CalicoBgpNeighborSessionDown" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionDown.md - service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? - summary: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. + description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? + summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }} - alert: CalicoBgpNeighborSessionAllDown - expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) == 0 + expr: sum by (cluster,node,pod) (bird_protocol_up{proto="BGP",state="Established"}) == 0 for: {{ dig "CalicoBgpNeighborSessionAllDown" "for" "10m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "CalicoBgpNeighborSessionAllDown" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionAllDown.md - service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Node {{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? - summary: Node {{`{{ $labels.node }}`}} has no BGP neighbors. + description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? + summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has no BGP neighbors. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }} @@ -59,11 +103,11 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "CalicoNodeMissing" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeMissing.md - service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Calico is not running on all bare metal nodes. Network datapath threatened! - summary: Calico is not running on all nodes. + description: Calico is not running on all BareMetal nodes. Network datapath threatened! 
+ summary: Calico is not running on all BareMetal nodes. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }} @@ -74,9 +118,9 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "CalicoNodeNotReady" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeNotReady.md - service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened! - summary: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. + description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened! + summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. 
{{- end }} diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index 933779a..112faad 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.0.29 + version: 1.1.0 displayName: Controlplane operations bundle - description: Operations bundle for Controlane clusters + description: Operations bundle for Controlplane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.0.29 + version: 1.1.0 options: - name: prometheusRules.create description: Create Prometheus rules diff --git a/charts/controlplane-operations/templates/_helpers.tpl b/charts/controlplane-operations/templates/_helpers.tpl index 555a753..716cebd 100644 --- a/charts/controlplane-operations/templates/_helpers.tpl +++ b/charts/controlplane-operations/templates/_helpers.tpl @@ -38,3 +38,13 @@ plugin: {{ $root.Release.Name }} {{- end }} {{- end }} {{- end }} + +{{- define "controlplane-operations.supportGroup" }} +{{- if eq "cp" . }} +{{- "containers" }} +{{- else if eq "compute" . }} +{{- "compute" }} +{{- else if eq "storage" . }} +{{- "storage" }} +{{- end }} +{{- end }}