Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.0.28
version: 1.0.29
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
22 changes: 11 additions & 11 deletions charts/controlplane-operations/alerts/controlplane-gardener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ groups:

{{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }}
- alert: ShootUnavailability
expr: shoot:availability == 0
expr: shoot:availability{instance=~"https:\\/\\/api\\..+",container=""} == 0
for: {{ dig "ShootUnavailability" "for" "15m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
Expand All @@ -15,8 +15,8 @@ groups:
service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "ShootUnavailability" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Shoot cluster(s) unavailability detected. Need manual investigation of the root cause. Check the shoot cluster(s) and underlying infrastructure for issues.
summary: Shoot cluster(s) unavailability detected.
description: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable. Need manual investigation of the root cause. Check the shoot and underlying infrastructure for issues.
summary: Shoot {{`{{ $labels.shoot_name }}`}} is unavailable.
{{- end }}

### Calico ###
Expand All @@ -32,8 +32,8 @@ groups:
service: {{ dig "CalicoBgpNeighborSessionDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoBgpNeighborSessionDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
summary: Node has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
description: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors. BGP peer is not established. Network datapath threatened! Switch upgrades or misconfiguration?
summary: Node {{`{{ $labels.node }}`}} has less than {{ .Values.prometheusRules.calico.bgpNeighborCount }} BGP neighbors.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoBgpNeighborSessionAllDown | default false) }}
Expand All @@ -47,8 +47,8 @@ groups:
service: {{ dig "CalicoBgpNeighborSessionAllDown" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoBgpNeighborSessionAllDown" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Node has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
summary: Node has no BGP neighbors.
description: Node {{`{{ $labels.node }}`}} has no BGP neighbors. Network datapath is down! Switch upgrades or misconfiguration?
summary: Node {{`{{ $labels.node }}`}} has no BGP neighbors.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoNodeMissing | default false) }}
Expand All @@ -62,8 +62,8 @@ groups:
service: {{ dig "CalicoNodeMissing" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoNodeMissing" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Calico is not running on all nodes.
summary: Calico is not running on all bare metal nodes. Network datapath threatened!
description: Calico is not running on all bare metal nodes. Network datapath threatened!
summary: Calico is not running on all nodes.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.CalicoNodeNotReady | default false) }}
Expand All @@ -77,6 +77,6 @@ groups:
service: {{ dig "CalicoNodeNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "CalicoNodeNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Calico-Node Pod is not Ready on all nodes.
summary: Calico-Node is not healthy on all bare metal nodes that are Ready. Risk of stale BGP advertisement. Network datapath threatened!
description: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened!
summary: Calico-Node Pod {{`{{ $labels.pod }}`}} on shoot {{`{{ $labels.shoot_name }}`}} is not Ready.
{{- end }}
15 changes: 15 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,18 @@ groups:
description: VLAN-tagged ARP/IP traffic is filtered by ARPtables/IPtables on `{{`{{ $labels.node }}`}}`. Network datapath threatened!
summary: Bridged VLAN-tagged traffic is filtered by IPtables.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.KubernetesNodeKCPNotReady | default false) }}
# KubernetesNodeKCPNotReady: fires when kube_node_status_condition reports
# condition="Ready" with status="false" for a node whose name matches "kcp-.+".
# The whole rule is omitted when prometheusRules.disabled.KubernetesNodeKCPNotReady
# is set to true in the chart values.
- alert: KubernetesNodeKCPNotReady
expr: kube_node_status_condition{condition="Ready",node=~"kcp-.+",status="false"} == 1
# Pending duration before firing; defaults to 30m unless
# prometheusRules.KubernetesNodeKCPNotReady.for is set.
for: {{ dig "KubernetesNodeKCPNotReady" "for" "30m" .Values.prometheusRules }}
labels:
# Extra labels come from the chart helper "controlplane-operations.additionalRuleLabels".
{{ include "controlplane-operations.additionalRuleLabels" . }}
# Severity defaults to "warning" and can be overridden per alert in the values.
severity: {{ dig "KubernetesNodeKCPNotReady" "severity" "warning" .Values.prometheusRules }}
# NOTE(review): this URL has no /blob/<branch>/ segment, which GitHub file links
# normally require - confirm that the playbook link actually resolves.
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/KubernetesNodeKCPNotReady.md
# service and support_group fall back to prometheusRules.defaultService and
# prometheusRules.defaultSupportGroup when no per-alert override is given.
service: {{ dig "KubernetesNodeKCPNotReady" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
support_group: {{ dig "KubernetesNodeKCPNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
# The backtick-wrapped expressions are Helm raw strings, so the $labels
# placeholders pass through chart rendering unchanged and are expanded by
# Prometheus when the alert fires.
# NOTE(review): presumably the series carries a "cluster" label - verify.
description: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not ready. Check node conditions and events for more details.
summary: KCP node {{`{{ $labels.node }}`}} on cluster {{`{{ $labels.cluster }}`}} is not ready.
{{- end }}
4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.0.28
version: 1.0.29
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.0.28
version: 1.0.29
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
Loading