Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.1.3
version: 1.1.4
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
123 changes: 123 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-gardener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,126 @@ groups:
description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened!
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready.
{{- end }}

### MCM ###

{{- if not (.Values.prometheusRules.disabled.MCMMachineNotReady | default false) }}
# MCMMachineNotReady: fires when the per-(name, shoot) sum of
# mcm_machine_status_condition for condition="Ready" equals 0 for the whole `for` window.
# Rendered unless .Values.prometheusRules.disabled.MCMMachineNotReady is set to a truthy value.
# `for`, `severity` and `support_group` are looked up under .Values.prometheusRules.MCMMachineNotReady
# and fall back to "30m", "warning" and .Values.prometheusRules.defaultSupportGroup respectively.
# label_replace writes the capture group of namespace =~ "shoot--cp--(.*)" into a new `shoot` label.
# The backtick-wrapped expressions in the annotations are emitted literally by Helm so that
# Prometheus, not Helm, expands $labels.
# NOTE(review): presumably a sample value of 0 means the Ready condition is False - confirm against the MCM metrics docs.
# NOTE(review): the playbook URL has no /blob/<branch>/ path segment - confirm the link resolves.
- alert: MCMMachineNotReady
expr: >
sum by (name, shoot) (
label_replace(
mcm_machine_status_condition{
job="machine-controller-manager",
condition="Ready"
},
"shoot", "$1",
"namespace", "shoot--cp--(.*)"
)
) == 0
for: {{ dig "MCMMachineNotReady" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineNotReady" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineNotReady.md
service: gardener
support_group: {{ dig "MCMMachineNotReady" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is not Ready. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is not Ready.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInTerminating | default false) }}
# MCMMachineStuckInTerminating: fires when the per-(name, shoot) sum of
# mcm_machine_current_status_phase equals -4 for the whole `for` window.
# Rendered unless .Values.prometheusRules.disabled.MCMMachineStuckInTerminating is set to a truthy value.
# `for`, `severity` and `support_group` are looked up under .Values.prometheusRules.MCMMachineStuckInTerminating
# and fall back to "30m", "warning" and .Values.prometheusRules.defaultSupportGroup respectively.
# label_replace writes the capture group of namespace =~ "shoot--cp--(.*)" into a new `shoot` label.
# The backtick-wrapped expressions in the annotations are emitted literally by Helm so that
# Prometheus, not Helm, expands $labels.
# NOTE(review): -4 presumably encodes the Terminating phase - confirm against the MCM metrics docs.
# NOTE(review): the playbook URL has no /blob/<branch>/ path segment - confirm the link resolves.
- alert: MCMMachineStuckInTerminating
expr: >
sum by (name, shoot) (
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"shoot", "$1",
"namespace", "shoot--cp--(.*)"
)
) == -4
for: {{ dig "MCMMachineStuckInTerminating" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineStuckInTerminating" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineStuckInTerminating.md
service: gardener
support_group: {{ dig "MCMMachineStuckInTerminating" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is stuck in Terminating state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is stuck in Terminating state.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineFailed | default false) }}
# MCMMachineFailed: fires when the per-(name, shoot) sum of
# mcm_machine_current_status_phase equals -3 for the whole `for` window.
# Rendered unless .Values.prometheusRules.disabled.MCMMachineFailed is set to a truthy value.
# `for`, `severity` and `support_group` are looked up under .Values.prometheusRules.MCMMachineFailed
# and fall back to "30m", "warning" and .Values.prometheusRules.defaultSupportGroup respectively.
# label_replace writes the capture group of namespace =~ "shoot--cp--(.*)" into a new `shoot` label.
# The backtick-wrapped expressions in the annotations are emitted literally by Helm so that
# Prometheus, not Helm, expands $labels.
# NOTE(review): -3 presumably encodes the Failed phase - confirm against the MCM metrics docs.
# NOTE(review): the playbook URL has no /blob/<branch>/ path segment - confirm the link resolves.
- alert: MCMMachineFailed
expr: >
sum by (name, shoot) (
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"shoot", "$1",
"namespace", "shoot--cp--(.*)"
)
) == -3
for: {{ dig "MCMMachineFailed" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineFailed" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineFailed.md
service: gardener
support_group: {{ dig "MCMMachineFailed" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is in Failed state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is in Failed state.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineCrashLoopBackOff | default false) }}
# MCMMachineCrashLoopBackOff: fires when the per-(name, shoot) sum of
# mcm_machine_current_status_phase equals -2 for the whole `for` window.
# Rendered unless .Values.prometheusRules.disabled.MCMMachineCrashLoopBackOff is set to a truthy value.
# `for`, `severity` and `support_group` are looked up under .Values.prometheusRules.MCMMachineCrashLoopBackOff
# and fall back to "30m", "warning" and .Values.prometheusRules.defaultSupportGroup respectively.
# label_replace writes the capture group of namespace =~ "shoot--cp--(.*)" into a new `shoot` label.
# The backtick-wrapped expressions in the annotations are emitted literally by Helm so that
# Prometheus, not Helm, expands $labels.
# NOTE(review): -2 presumably encodes the CrashLoopBackOff phase - confirm against the MCM metrics docs.
# NOTE(review): the playbook URL has no /blob/<branch>/ path segment - confirm the link resolves.
- alert: MCMMachineCrashLoopBackOff
expr: >
sum by (name, shoot) (
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"shoot", "$1",
"namespace", "shoot--cp--(.*)"
)
) == -2
for: {{ dig "MCMMachineCrashLoopBackOff" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineCrashLoopBackOff" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineCrashLoopBackOff.md
service: gardener
support_group: {{ dig "MCMMachineCrashLoopBackOff" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is in CrashLoopBackOff state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is in CrashLoopBackOff state.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInPending | default false) }}
# MCMMachineStuckInPending: fires when the per-(name, shoot) sum of
# mcm_machine_current_status_phase equals 0 for the whole `for` window.
# Rendered unless .Values.prometheusRules.disabled.MCMMachineStuckInPending is set to a truthy value.
# `for`, `severity` and `support_group` are looked up under .Values.prometheusRules.MCMMachineStuckInPending
# and fall back to "30m", "warning" and .Values.prometheusRules.defaultSupportGroup respectively.
# label_replace writes the capture group of namespace =~ "shoot--cp--(.*)" into a new `shoot` label.
# The backtick-wrapped expressions in the annotations are emitted literally by Helm so that
# Prometheus, not Helm, expands $labels.
# NOTE(review): 0 presumably encodes the Pending phase - confirm against the MCM metrics docs.
# NOTE(review): the playbook URL has no /blob/<branch>/ path segment - confirm the link resolves.
- alert: MCMMachineStuckInPending
expr: >
sum by (name, shoot) (
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"shoot", "$1",
"namespace", "shoot--cp--(.*)"
)
) == 0
for: {{ dig "MCMMachineStuckInPending" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineStuckInPending" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineStuckInPending.md
service: gardener
support_group: {{ dig "MCMMachineStuckInPending" "support_group" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is stuck in Pending state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot }}`}} is stuck in Pending state.
{{- end }}
4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.1.3
version: 1.1.4
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.1.3
version: 1.1.4
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
Loading