From e1fed97470752783220b23f3dc7b5e1fa6327e3e Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Mon, 23 Mar 2026 16:20:52 +0100 Subject: [PATCH 1/6] Gardener alerts revisited --- .../alerts/controlplane-gardener.yaml | 50 +++++++++++++++---- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 67374d1..a9a8e78 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -4,19 +4,49 @@ groups: ### Gardener ### -{{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }} - - alert: ShootUnavailability - expr: shoot:availability{instance=~"https:\\/\\/api\\..+",container=""} == 0 - for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }} +{{- if not (.Values.prometheusRules.disabled.ShootReconciliationFailed | default false) }} + - alert: ShootReconciliationFailed + expr: garden_shoot_operation_states{operation="Reconcile"} != 1 + for: {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . 
}} - severity: {{ dig "ShootUnavailability" "severity" "warning" .Values.prometheusRules }} - playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootUnavailability.md - service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} - support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + severity: {{ dig "ShootReconciliationFailed" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootReconciliationFailed.md + service: {{ dig "ShootReconciliationFailed" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "ShootReconciliationFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. Need manual investigation of the root cause. Check the shoot and underlying infrastructure for issues. - summary: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. + description: Shoot {{`{{ $labels.name }}`}} is not being reconciled successfully. Check the shoot's conditions and events for more details. + summary: Shoot {{`{{ $labels.name }}`}} is not being reconciled successfully. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.ShootConditionNotTrue | default false) }} + - alert: ShootConditionNotTrue + expr: max by(name) (garden_shoot_condition != 1) + for: {{ dig "ShootConditionNotTrue" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . 
}} + severity: {{ dig "ShootConditionNotTrue" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootConditionNotTrue.md + service: {{ dig "ShootConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "ShootConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Shoot {{`{{ $labels.name }}`}} has a condition that is not True. Check the shoot's conditions and events for more details. + summary: Shoot {{`{{ $labels.name }}`}} has a condition that is not True. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.SeedConditionNotTrue | default false) }} + - alert: SeedConditionNotTrue + expr: max by(name) (garden_seed_condition != 1) + for: {{ dig "SeedConditionNotTrue" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "SeedConditionNotTrue" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/SeedConditionNotTrue.md + service: {{ dig "SeedConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "SeedConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Seed {{`{{ $labels.name }}`}} has a condition that is not True. Check the seed's conditions and events for more details. + summary: Seed {{`{{ $labels.name }}`}} has a condition that is not True. 
{{- end }} ### Calico ### From ee9dd6d8e8173e3679b6b00e3459c53871dee3ff Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Mon, 23 Mar 2026 16:27:22 +0100 Subject: [PATCH 2/6] Gardenlet alert --- .../alerts/controlplane-gardener.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index a9a8e78..17cf0de 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -49,6 +49,21 @@ groups: summary: Seed {{`{{ $labels.name }}`}} has a condition that is not True. {{- end }} +{{- if not (.Values.prometheusRules.disabled.GardenletNotReady | default false) }} + - alert: GardenletNotReady + expr: garden_garden_condition{status="True"} != 1 + for: {{ dig "GardenletNotReady" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "GardenletNotReady" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/GardenletNotReady.md + service: {{ dig "GardenletNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + support_group: {{ dig "GardenletNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + annotations: + description: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. Check the gardenlet's conditions and events for more details. + summary: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. 
+{{- end }} + ### Calico ### {{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionDown | default false) }} From f4fc1a910e04e51c04eb1872ae155d8e08a36740 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 24 Mar 2026 14:39:57 +0100 Subject: [PATCH 3/6] refinement --- .../alerts/controlplane-gardener.yaml | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 17cf0de..d8922df 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -6,8 +6,7 @@ groups: {{- if not (.Values.prometheusRules.disabled.ShootReconciliationFailed | default false) }} - alert: ShootReconciliationFailed - expr: garden_shoot_operation_states{operation="Reconcile"} != 1 - for: {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }} + expr: min_over_time(garden_shoot_operation_states{operation="Reconcile"}[{{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}:]) != 1 labels: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ShootReconciliationFailed" "severity" "warning" .Values.prometheusRules }} @@ -15,13 +14,13 @@ groups: service: {{ dig "ShootReconciliationFailed" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "ShootReconciliationFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Shoot {{`{{ $labels.name }}`}} is not being reconciled successfully. Check the shoot's conditions and events for more details. - summary: Shoot {{`{{ $labels.name }}`}} is not being reconciled successfully. 
+ description: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully for {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}. Check the shoot's conditions and events for more details. + summary: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully. {{- end }} {{- if not (.Values.prometheusRules.disabled.ShootConditionNotTrue | default false) }} - alert: ShootConditionNotTrue - expr: max by(name) (garden_shoot_condition != 1) + expr: max by(project,landscape,seed,name) (garden_shoot_condition != 1) for: {{ dig "ShootConditionNotTrue" "for" "30m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} @@ -30,13 +29,13 @@ groups: service: {{ dig "ShootConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "ShootConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Shoot {{`{{ $labels.name }}`}} has a condition that is not True. Check the shoot's conditions and events for more details. - summary: Shoot {{`{{ $labels.name }}`}} has a condition that is not True. + description: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. Check the Shoot's conditions and events for more details. + summary: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. 
{{- end }} {{- if not (.Values.prometheusRules.disabled.SeedConditionNotTrue | default false) }} - alert: SeedConditionNotTrue - expr: max by(name) (garden_seed_condition != 1) + expr: max by(landscape,name) (garden_seed_condition != 1) for: {{ dig "SeedConditionNotTrue" "for" "30m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} @@ -45,13 +44,13 @@ groups: service: {{ dig "SeedConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "SeedConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Seed {{`{{ $labels.name }}`}} has a condition that is not True. Check the seed's conditions and events for more details. - summary: Seed {{`{{ $labels.name }}`}} has a condition that is not True. + description: Seed {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} has a condition that is not True. Check the Seed's conditions and events for more details. + summary: Seed {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} has a condition that is not True. {{- end }} {{- if not (.Values.prometheusRules.disabled.GardenletNotReady | default false) }} - alert: GardenletNotReady - expr: garden_garden_condition{status="True"} != 1 + expr: max by(landscape) (garden_garden_condition{status="True"}) != 1 for: {{ dig "GardenletNotReady" "for" "30m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} @@ -60,7 +59,7 @@ groups: service: {{ dig "GardenletNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "GardenletNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. Check the gardenlet's conditions and events for more details. 
+ description: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. Check the Gardenlet's conditions and events for more details. summary: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. {{- end }} @@ -68,7 +67,7 @@ groups: {{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionDown | default false) }} - alert: CalicoBgpNeighborSessionDown - expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }} + expr: sum by (cluster,node,pod) (bird_protocol_up{proto="BGP",state="Established"}) < {{ .Values.prometheusRules.calico.bgpNeighborCount }} for: {{ dig "CalicoBgpNeighborSessionDown" "for" "30m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} @@ -77,13 +76,13 @@ groups: service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? - summary: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. + description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? 
+ summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. 
{{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }} - alert: CalicoBgpNeighborSessionAllDown - expr: sum by (node) (bird_protocol_up{proto="BGP",state="Established"}) == 0 + expr: sum by (cluster,node,pod) (bird_protocol_up{proto="BGP",state="Established"}) == 0 for: {{ dig "CalicoBgpNeighborSessionAllDown" "for" "10m" .Values.prometheusRules }} labels: {{ include "controlplane-operations.additionalRuleLabels" . }} @@ -92,8 +91,8 @@ groups: service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Node {{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? - summary: Node {{`{{ $labels.node }}`}} has no BGP neighbors. + description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? + summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has no BGP neighbors. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }} @@ -107,8 +106,8 @@ groups: service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Calico is not running on all bare metal nodes. Network datapath threatened! - summary: Calico is not running on all nodes. + description: Calico is not running on all BareMetal nodes. Network datapath threatened! 
+ summary: Calico is not running on all BareMetal nodes. {{- end }} {{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }} @@ -122,6 +121,6 @@ groups: service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: - description: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened! - summary: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. + description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened! + summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. {{- end }} From c46f2b6bc2da111e76b6debe09802f375b94096b Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 24 Mar 2026 14:41:37 +0100 Subject: [PATCH 4/6] chart bump --- charts/controlplane-operations/Chart.yaml | 2 +- charts/controlplane-operations/plugindefinition.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index df16ecf..4d5ed48 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.0.29 +version: 1.1.0 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. 
maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index 933779a..112faad 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.0.29 + version: 1.1.0 displayName: Controlplane operations bundle description: Operations bundle for Controlane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.0.29 + version: 1.1.0 options: - name: prometheusRules.create description: Create Prometheus rules From e6b4c8be7d68d3bd7822545163ee42720b10557e Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Tue, 24 Mar 2026 15:46:57 +0100 Subject: [PATCH 5/6] template service and support_group --- .../alerts/controlplane-gardener.yaml | 8 ++++---- .../templates/_helpers.tpl | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index d8922df..020502d 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -11,8 +11,8 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "ShootReconciliationFailed" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootReconciliationFailed.md - service: {{ dig "ShootReconciliationFailed" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} - support_group: {{ dig "ShootReconciliationFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + service: {{ include "controlplane-operations.supportService" "`{{ $labels.project }}`" | default (dig "ShootReconciliationFailed" "service" .Values.prometheusRules.defaultService .Values.prometheusRules) }} + support_group: {{ include "controlplane-operations.supportGroup" "`{{ $labels.project }}`" | default (dig "ShootReconciliationFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules) }} annotations: description: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully for {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}. Check the shoot's conditions and events for more details. summary: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully. @@ -26,8 +26,8 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "ShootConditionNotTrue" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootConditionNotTrue.md - service: {{ dig "ShootConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} - support_group: {{ dig "ShootConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} + service: {{ include "controlplane-operations.supportService" "`{{ $labels.project }}`" | default (dig "ShootConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules) }} + support_group: {{ include "controlplane-operations.supportGroup" "`{{ $labels.project }}`" | default (dig "ShootConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules) }} annotations: description: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. Check the Shoot's conditions and events for more details. summary: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. diff --git a/charts/controlplane-operations/templates/_helpers.tpl b/charts/controlplane-operations/templates/_helpers.tpl index 555a753..821cf34 100644 --- a/charts/controlplane-operations/templates/_helpers.tpl +++ b/charts/controlplane-operations/templates/_helpers.tpl @@ -38,3 +38,23 @@ plugin: {{ $root.Release.Name }} {{- end }} {{- end }} {{- end }} + +{{- define "controlplane-operations.supportService" }} +{{- if eq "cp" . }} +{{- "cc-cp" }} +{{- else if eq "compute" . }} +{{- "metal-api" }} +{{- else if eq "storage" . }} +{{- "ceph" }} +{{- end }} +{{- end }} + +{{- define "controlplane-operations.supportGroup" }} +{{- if eq "cp" . 
}} +{{- "containers" }} +{{- else if eq "compute" . }} +{{- "compute" }} +{{- else if eq "storage" . }} +{{- "storage" }} +{{- end }} +{{- end }} From b4dd0be07ac2f61ef0f752b5aced63283eaecb37 Mon Sep 17 00:00:00 2001 From: Vladimir Videlov Date: Wed, 25 Mar 2026 09:22:59 +0100 Subject: [PATCH 6/6] service label --- .../alerts/controlplane-gardener.yaml | 16 ++++++++-------- .../templates/_helpers.tpl | 10 ---------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index 020502d..8dc469a 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -11,7 +11,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "ShootReconciliationFailed" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootReconciliationFailed.md - service: {{ include "controlplane-operations.supportService" "`{{ $labels.project }}`" | default (dig "ShootReconciliationFailed" "service" .Values.prometheusRules.defaultService .Values.prometheusRules) }} + service: gardener support_group: {{ include "controlplane-operations.supportGroup" "`{{ $labels.project }}`" | default (dig "ShootReconciliationFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules) }} annotations: description: Shoot {{`{{ $labels.name }}`}} from project {{`{{ $labels.project }}`}} on {{`{{ $labels.landscape }}`}} is not being reconciled successfully for {{ dig "ShootReconciliationFailed" "for" "30m" .Values.prometheusRules }}. Check the shoot's conditions and events for more details. @@ -26,7 +26,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "ShootConditionNotTrue" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootConditionNotTrue.md - service: {{ include "controlplane-operations.supportService" "`{{ $labels.project }}`" | default (dig "ShootConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules) }} + service: gardener support_group: {{ include "controlplane-operations.supportGroup" "`{{ $labels.project }}`" | default (dig "ShootConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules) }} annotations: description: Shoot {{`{{ $labels.name }}`}} of project {{`{{ $labels.project }}`}} seeded from {{`{{ $labels.landscape }}`}}/{{`{{ $labels.seed }}`}} has a condition that is not True. Check the Shoot's conditions and events for more details. @@ -41,7 +41,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "SeedConditionNotTrue" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/SeedConditionNotTrue.md - service: {{ dig "SeedConditionNotTrue" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: gardener support_group: {{ dig "SeedConditionNotTrue" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Seed {{`{{ $labels.name }}`}} on {{`{{ $labels.landscape }}`}} has a condition that is not True. Check the Seed's conditions and events for more details. @@ -56,7 +56,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "GardenletNotReady" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/GardenletNotReady.md - service: {{ dig "GardenletNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: gardener support_group: {{ dig "GardenletNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Gardenlet on {{`{{ $labels.landscape }}`}} is not Ready. Check the Gardenlet's conditions and events for more details. @@ -73,7 +73,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "CalicoBgpNeighborSessionDown" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionDown.md - service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration? @@ -88,7 +88,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "CalicoBgpNeighborSessionAllDown" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoBgpNeighborSessionAllDown.md - service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot/Node {{`{{ reReplaceAll "^shoot--cp--" "" $labels.cluster }}`}}/{{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration? @@ -103,7 +103,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . }} severity: {{ dig "CalicoNodeMissing" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeMissing.md - service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Calico is not running on all BareMetal nodes. Network datapath threatened! @@ -118,7 +118,7 @@ groups: {{ include "controlplane-operations.additionalRuleLabels" . 
}} severity: {{ dig "CalicoNodeNotReady" "severity" "warning" .Values.prometheusRules }} playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/CalicoNodeNotReady.md - service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }} + service: calico support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }} annotations: description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened! diff --git a/charts/controlplane-operations/templates/_helpers.tpl b/charts/controlplane-operations/templates/_helpers.tpl index 821cf34..716cebd 100644 --- a/charts/controlplane-operations/templates/_helpers.tpl +++ b/charts/controlplane-operations/templates/_helpers.tpl @@ -39,16 +39,6 @@ plugin: {{ $root.Release.Name }} {{- end }} {{- end }} -{{- define "controlplane-operations.supportService" }} -{{- if eq "cp" . }} -{{- "cc-cp" }} -{{- else if eq "compute" . }} -{{- "metal-api" }} -{{- else if eq "storage" . }} -{{- "ceph" }} -{{- end }} -{{- end }} - {{- define "controlplane-operations.supportGroup" }} {{- if eq "cp" . }} {{- "containers" }}