# Patch overview: a whitespace-collapsed unified diff touching three files of
# the controlplane-operations Helm chart. These comment lines sit ahead of the
# first `diff --git` header; the patch text below them is unchanged.
#
# 1. Chart.yaml: chart `version` goes from 1.1.3 to 1.1.5.
# 2. alerts/controlplane-gardener.yaml: appends an "MCM" section with five
#    alert rules: MCMMachineNotReady, MCMMachineStuckInTerminating,
#    MCMMachineFailed, MCMMachineCrashLoopBackOff, MCMMachineStuckInPending.
# 3. plugindefinition.yaml: `spec.version` and the `helmChart` `version` go
#    from 1.1.3 to 1.1.5.
#
# Shape shared by every added rule:
#   - The rule is rendered unless
#     `.Values.prometheusRules.disabled.<AlertName>` is set to a truthy value
#     (`| default false` covers the unset case).
#   - `for` and `severity` are read with
#     `dig "<AlertName>" "<key>" <default> .Values.prometheusRules`, with
#     defaults "30m" and "warning".
#   - The expression selects job="machine-controller-manager" and stacks three
#     label_replace calls to derive `support_group`: the innermost one matches
#     ".*" against `__name__`, so it writes "containers" on every series; the
#     two outer ones overwrite that with "storage" / "compute" when `project`
#     is exactly that value.
#   - The series are then summed by (name, project, shoot_name, support_group)
#     and compared to a constant with `==`.
#   - Alert-time templates such as `$labels.name` are wrapped in {{`...`}} so
#     that Helm emits them literally for Prometheus to expand.
#
# Metric and constant compared by each rule:
#   MCMMachineNotReady            mcm_machine_status_condition{condition="Ready"} == 0
#   MCMMachineStuckInTerminating  mcm_machine_current_status_phase == -4
#   MCMMachineFailed              mcm_machine_current_status_phase == -3
#   MCMMachineCrashLoopBackOff    mcm_machine_current_status_phase == -2
#   MCMMachineStuckInPending      mcm_machine_current_status_phase == 0
#   (which phase each number stands for is inferred from the alert names;
#   confirm against the machine-controller-manager metrics documentation)
#
# NOTE(review): the playbook links have no `blob/<branch>/` path segment and
#   use the cobaltcore-dev organisation, while the plugin definition URLs use
#   cloudoperators; verify that the links resolve.
# NOTE(review): because the aggregation is `sum`, several series sharing the
#   same four grouping labels would add up and stop matching the constant;
#   presumably there is one series per machine, confirm.
# NOTE(review): the version moves from 1.1.3 straight to 1.1.5; 1.1.4 is not
#   visible in this patch.
# NOTE(review): "Controlane" in the plugindefinition.yaml description is an
#   unchanged context line and looks like a typo for "Controlplane"; editing it
#   here would stop the hunk from matching its target.
diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml index a64ecea..297446b 100644 --- a/charts/controlplane-operations/Chart.yaml +++ b/charts/controlplane-operations/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: controlplane-operations -version: 1.1.3 +version: 1.1.5 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters. maintainers: - name: Vladimir Videlov (d051408) diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml index b316936..ca09261 100644 --- a/charts/controlplane-operations/alerts/controlplane-gardener.yaml +++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml @@ -224,3 +224,151 @@ groups: description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened! summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. {{- end }} + +### MCM ### + +{{- if not (.Values.prometheusRules.disabled.MCMMachineNotReady | default false) }} + - alert: MCMMachineNotReady + expr: > + sum by (name, project, shoot_name, support_group) ( + label_replace( + label_replace( + label_replace( + mcm_machine_status_condition{ + job="machine-controller-manager", + condition="Ready" + }, + "support_group", "containers", "__name__", ".*" + ), + "support_group", "storage", "project", "^storage$" + ), + "support_group", "compute", "project", "^compute$" + ) + ) == 0 + for: {{ dig "MCMMachineNotReady" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . 
}} + severity: {{ dig "MCMMachineNotReady" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineNotReady.md + service: gardener + support_group: "{{`{{ $labels.support_group }}`}}" + annotations: + description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is not Ready. Check the Machine's conditions and events for more details. + summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is not Ready. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInTerminating | default false) }} + - alert: MCMMachineStuckInTerminating + expr: > + sum by (name, project, shoot_name, support_group) ( + label_replace( + label_replace( + label_replace( + mcm_machine_current_status_phase{ + job="machine-controller-manager", + }, + "support_group", "containers", "__name__", ".*" + ), + "support_group", "storage", "project", "^storage$" + ), + "support_group", "compute", "project", "^compute$" + ) + ) == -4 + for: {{ dig "MCMMachineStuckInTerminating" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "MCMMachineStuckInTerminating" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineStuckInTerminating.md + service: gardener + support_group: "{{`{{ $labels.support_group }}`}}" + annotations: + description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Terminating state. Check the Machine's conditions and events for more details. + summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Terminating state. 
+{{- end }} + +{{- if not (.Values.prometheusRules.disabled.MCMMachineFailed | default false) }} + - alert: MCMMachineFailed + expr: > + sum by (name, project, shoot_name, support_group) ( + label_replace( + label_replace( + label_replace( + mcm_machine_current_status_phase{ + job="machine-controller-manager", + }, + "support_group", "containers", "__name__", ".*" + ), + "support_group", "storage", "project", "^storage$" + ), + "support_group", "compute", "project", "^compute$" + ) + ) == -3 + for: {{ dig "MCMMachineFailed" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "MCMMachineFailed" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineFailed.md + service: gardener + support_group: "{{`{{ $labels.support_group }}`}}" + annotations: + description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in Failed state. Check the Machine's conditions and events for more details. + summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in Failed state. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.MCMMachineCrashLoopBackOff | default false) }} + - alert: MCMMachineCrashLoopBackOff + expr: > + sum by (name, project, shoot_name, support_group) ( + label_replace( + label_replace( + label_replace( + mcm_machine_current_status_phase{ + job="machine-controller-manager", + }, + "support_group", "containers", "__name__", ".*" + ), + "support_group", "storage", "project", "^storage$" + ), + "support_group", "compute", "project", "^compute$" + ) + ) == -2 + for: {{ dig "MCMMachineCrashLoopBackOff" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . 
}} + severity: {{ dig "MCMMachineCrashLoopBackOff" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineCrashLoopBackOff.md + service: gardener + support_group: "{{`{{ $labels.support_group }}`}}" + annotations: + description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in CrashLoopBackOff state. Check the Machine's conditions and events for more details. + summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in CrashLoopBackOff state. +{{- end }} + +{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInPending | default false) }} + - alert: MCMMachineStuckInPending + expr: > + sum by (name, project, shoot_name, support_group) ( + label_replace( + label_replace( + label_replace( + mcm_machine_current_status_phase{ + job="machine-controller-manager", + }, + "support_group", "containers", "__name__", ".*" + ), + "support_group", "storage", "project", "^storage$" + ), + "support_group", "compute", "project", "^compute$" + ) + ) == 0 + for: {{ dig "MCMMachineStuckInPending" "for" "30m" .Values.prometheusRules }} + labels: + {{ include "controlplane-operations.additionalRuleLabels" . }} + severity: {{ dig "MCMMachineStuckInPending" "severity" "warning" .Values.prometheusRules }} + playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineStuckInPending.md + service: gardener + support_group: "{{`{{ $labels.support_group }}`}}" + annotations: + description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Pending state. Check the Machine's conditions and events for more details. + summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Pending state. 
+{{- end }} diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml index f1ce914..8b9a4c7 100644 --- a/charts/controlplane-operations/plugindefinition.yaml +++ b/charts/controlplane-operations/plugindefinition.yaml @@ -3,7 +3,7 @@ kind: PluginDefinition metadata: name: controlplane-operations spec: - version: 1.1.3 + version: 1.1.5 displayName: Controlplane operations bundle description: Operations bundle for Controlane clusters docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md @@ -11,7 +11,7 @@ spec: helmChart: name: controlplane-operations repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts - version: 1.1.3 + version: 1.1.5 options: - name: prometheusRules.create description: Create Prometheus rules