Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.0.13
version: 1.0.14
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{{- /*
  Alerting rules for Gardener shoot clusters.

  ShootUnavailability fires when the `shoot:availability` series equals 0 for
  the configured duration. Per-alert overrides are read from
  .Values.prometheusRules.ShootUnavailability (for, severity, service,
  supportGroup); the alert is skipped entirely when
  .Values.prometheusRules.disabled.ShootUnavailability is true.
*/ -}}
groups:
- name: controlplane-gardener
  rules:
{{- if not (.Values.prometheusRules.disabled.ShootUnavailability | default false) }}
{{- /* Resolve the duration once so the `for` clause and the alert text cannot drift apart. */}}
{{- $shootUnavailabilityFor := dig "ShootUnavailability" "for" "10m" .Values.prometheusRules }}
  - alert: ShootUnavailability
    expr: shoot:availability == 0
    for: {{ $shootUnavailabilityFor }}
    labels:
      {{ include "controlplane-operations.additionalRuleLabels" . }}
      severity: {{ dig "ShootUnavailability" "severity" "info" .Values.prometheusRules }}
      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ShootUnavailability.md
      service: {{ dig "ShootUnavailability" "service" .Values.prometheusRules.defaultService .Values.prometheusRules }}
      support_group: {{ dig "ShootUnavailability" "supportGroup" .Values.prometheusRules.defaultSupportGroup .Values.prometheusRules }}
    annotations:
      description: Shoot cluster is unavailable for more than {{ $shootUnavailabilityFor }}.
      summary: Shoot cluster is unavailable.
{{- end }}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
groups:
- name: controlplane-remote.alerts
- name: controlplane-remote
rules:
{{- if not (.Values.prometheusRules.disabled.ArgoraUpdateInError | default false) }}
- alert: ArgoraUpdateInError
Expand Down
4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.0.13
version: 1.0.14
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.0.13
version: 1.0.14
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
3 changes: 3 additions & 0 deletions charts/controlplane-operations/templates/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ metadata:
{{- if $.Values.prometheusRules.labels }}
{{ toYaml $.Values.prometheusRules.labels | indent 4 }}
{{- end }}
{{- if and $.Values.prometheusRules.gardenerRuleLabels (contains "gardener" $path) }}
{{ toYaml $.Values.prometheusRules.gardenerRuleLabels | indent 4 }}
{{- end }}
{{- if $.Values.prometheusRules.annotations }}
annotations:
{{ toYaml $.Values.prometheusRules.annotations | indent 4 }}
Expand Down
11 changes: 7 additions & 4 deletions charts/controlplane-operations/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ prometheusRules:
## This is useful for adding additional labels of alerts to each rule
additionalRuleLabels: {}

## Additional metadata labels for PrometheusRule resources generated from Gardener-related rule files (files whose path contains "gardener")
gardenerRuleLabels: {}

## Additional annotations for PrometheusRule alerts
additionalRuleAnnotations: {}

Expand All @@ -44,10 +47,10 @@ prometheusRules:
# KubernetesApiServerDown: true
# KubeletDown: true

# NodeVirtualInterfaceDown:
# service: "cc-cp"
# supportGroup: "containers"
# for: "15m"
# ServerStuckInDiscovery:
# service: "metal-api"
# supportGroup: "foundation"
# for: "10m"
# severity: "warning"

## Create default dashboards for monitoring the cluster
Expand Down
Loading