controlplane-operations: release 1.0.14 with Gardener shoot alerting

Summary of the change set below:
  * Chart.yaml and plugindefinition.yaml: version 1.0.13 -> 1.0.14; the
    PluginDefinition description typo ("Controlane") is corrected.
  * alerts/controlplane-gardener.yaml (new): rule group
    "controlplane-gardener" with the ShootUnavailability alert, which fires
    when `shoot:availability == 0` holds for the configured duration
    (default "10m"). The description annotation renders that same duration
    so the text stays correct when `for` is overridden in values.
  * alerts/controlplane-remote.yaml: rule group renamed from
    "controlplane-remote.alerts" to "controlplane-remote".
  * templates/alerts.yaml: when prometheusRules.gardenerRuleLabels is set and
    $path contains "gardener", those labels are emitted directly after
    prometheusRules.labels in the metadata section.
  * values.yaml: new prometheusRules.gardenerRuleLabels (default {}); the
    commented per-alert override example now uses ServerStuckInDiscovery.

diff --git a/charts/controlplane-operations/Chart.yaml b/charts/controlplane-operations/Chart.yaml
index 6f21490..fa5e998 100644
--- a/charts/controlplane-operations/Chart.yaml
+++ b/charts/controlplane-operations/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: controlplane-operations
-version: 1.0.13
+version: 1.0.14
 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
 maintainers:
   - name: Vladimir Videlov (d051408)
diff --git a/charts/controlplane-operations/alerts/controlplane-gardener.yaml b/charts/controlplane-operations/alerts/controlplane-gardener.yaml
new file mode 100644
index 0000000..6a2d8f0
--- /dev/null
+++ b/charts/controlplane-operations/alerts/controlplane-gardener.yaml
@@ -0,0 +1,17 @@
+groups:
+- name: controlplane-gardener
+  rules:
+{{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }}
+  - alert: ShootUnavailability
+    expr: shoot:availability == 0
+    for: {{ dig "ShootUnavailability" "for" "10m" .Values.prometheusRules }}
+    labels:
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+      severity: {{ dig "ShootUnavailability" "severity" "info" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootUnavailability.md
+      service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
+      support_group: {{ dig "ShootUnavailability" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
+    annotations:
+      description: Shoot cluster is unavailable for more than {{ dig "ShootUnavailability" "for" "10m" .Values.prometheusRules }}.
+      summary: Shoot cluster is unavailable.
+{{- end }}
diff --git a/charts/controlplane-operations/alerts/controlplane-remote.yaml b/charts/controlplane-operations/alerts/controlplane-remote.yaml
index fa51e0e..3d29a86 100644
--- a/charts/controlplane-operations/alerts/controlplane-remote.yaml
+++ b/charts/controlplane-operations/alerts/controlplane-remote.yaml
@@ -1,5 +1,5 @@
 groups:
-- name: controlplane-remote.alerts
+- name: controlplane-remote
   rules:
 {{- if not (.Values.prometheusRules.disabled.ArgoraUpdateInError | default false) }}
   - alert: ArgoraUpdateInError
diff --git a/charts/controlplane-operations/plugindefinition.yaml b/charts/controlplane-operations/plugindefinition.yaml
index c7fdfd8..8b6f6db 100644
--- a/charts/controlplane-operations/plugindefinition.yaml
+++ b/charts/controlplane-operations/plugindefinition.yaml
@@ -3,7 +3,7 @@ kind: PluginDefinition
 metadata:
   name: controlplane-operations
 spec:
-  version: 1.0.13
+  version: 1.0.14
   displayName: Controlplane operations bundle
-  description: Operations bundle for Controlane clusters
+  description: Operations bundle for Controlplane clusters
   docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
@@ -11,7 +11,7 @@ spec:
   helmChart:
     name: controlplane-operations
     repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
-    version: 1.0.13
+    version: 1.0.14
   options:
     - name: prometheusRules.create
       description: Create Prometheus rules
diff --git a/charts/controlplane-operations/templates/alerts.yaml b/charts/controlplane-operations/templates/alerts.yaml
index 7d14ddf..f192f24 100644
--- a/charts/controlplane-operations/templates/alerts.yaml
+++ b/charts/controlplane-operations/templates/alerts.yaml
@@ -11,6 +11,9 @@ metadata:
 {{- if $.Values.prometheusRules.labels }}
 {{ toYaml $.Values.prometheusRules.labels | indent 4 }}
 {{- end }}
+{{- if and $.Values.prometheusRules.gardenerRuleLabels (contains "gardener" $path) }}
+{{ toYaml $.Values.prometheusRules.gardenerRuleLabels | indent 4 }}
+{{- end }}
 {{- if $.Values.prometheusRules.annotations }}
   annotations:
 {{ toYaml $.Values.prometheusRules.annotations | indent 4 }}
diff --git a/charts/controlplane-operations/values.yaml b/charts/controlplane-operations/values.yaml
index 978d5fc..64ca625 100644
--- a/charts/controlplane-operations/values.yaml
+++ b/charts/controlplane-operations/values.yaml
@@ -36,6 +36,9 @@ prometheusRules:
   ## This is useful for adding additional labels of alerts to each rule
   additionalRuleLabels: {}

+  ## Additional metadata labels for PrometheusRule resources whose alerts path contains "gardener"
+  gardenerRuleLabels: {}
+
   ## Additional annotations for PrometheusRule alerts
   additionalRuleAnnotations: {}

@@ -44,10 +47,10 @@ prometheusRules:
   # KubernetesApiServerDown: true
   # KubeletDown: true

-  # NodeVirtualInterfaceDown:
-  #   service: "cc-cp"
-  #   supportGroup: "containers"
-  #   for: "15m"
+  # ServerStuckInDiscovery:
+  #   service: "metal-api"
+  #   supportGroup: "foundation"
+  #   for: "10m"
   #   severity: "warning"

   ## Create default dashboards for monitoring the cluster