From 38611a0ce879fe69ed21fa236ffa8790a7eef842 Mon Sep 17 00:00:00 2001
From: Tobias Wackenhut
Date: Tue, 21 Nov 2023 14:48:23 +0100
Subject: [PATCH] Move modified HPA alert to helm chart

* Rename the KubeHpaMaxedOut alert to KubeHpaMaxedOutMultiPod
* Include deployment of this alert in kube-prometheus-stack helm chart

Signed-off-by: Tobias Wackenhut
---
 CHANGELOG.md                                |  5 +++++
 monitoring/bin/deploy_monitoring_cluster.sh |  9 ---------
 monitoring/kube-hpa-alert-patch.json        |  7 -------
 monitoring/values-prom-operator.yaml        | 23 +++++++++++++++++++++
 4 files changed, 28 insertions(+), 16 deletions(-)
 delete mode 100644 monitoring/kube-hpa-alert-patch.json

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c4dce3fc..8b431ff9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # SAS Viya Monitoring for Kubernetes
 
+## Unreleased
+
+* **Metrics**
+  * [CHANGE] The KubeHpaMaxedOut alert has (effectively) been renamed KubeHpaMaxedOutMultiPod
+
 ## Version 1.2.20 (12DEC2023)
 
 * **Metrics**
diff --git a/monitoring/bin/deploy_monitoring_cluster.sh b/monitoring/bin/deploy_monitoring_cluster.sh
index 8c797c02..7b99b601 100755
--- a/monitoring/bin/deploy_monitoring_cluster.sh
+++ b/monitoring/bin/deploy_monitoring_cluster.sh
@@ -320,15 +320,6 @@ for f in monitoring/rules/viya/rules-*.yaml; do
   kubectl apply -n $MON_NS -f $f
 done
 
-kubectl get prometheusrule -n $MON_NS v4m-kubernetes-apps 2>/dev/null
-if [ $? == 0 ]; then
-  log_verbose "Patching KubeHpaMaxedOut rule"
-  # Fixes the issue of false positives when max replicas == 1
-  kubectl patch prometheusrule --type='json' -n $MON_NS v4m-kubernetes-apps --patch "$(cat monitoring/kube-hpa-alert-patch.json)"
-else
-  log_debug "PrometheusRule $MON_NS/v4m-kubernetes-apps does not exist"
-fi
-
 # Elasticsearch Datasource for Grafana
 LOGGING_DATASOURCE="${LOGGING_DATASOURCE:-false}"
 if [ "$LOGGING_DATASOURCE" == "true" ]; then
diff --git a/monitoring/kube-hpa-alert-patch.json b/monitoring/kube-hpa-alert-patch.json
deleted file mode 100644
index 7b423909..00000000
--- a/monitoring/kube-hpa-alert-patch.json
+++ /dev/null
@@ -1,7 +0,0 @@
-[
-  {
-    "op" : "replace",
-    "path" : "/spec/groups/0/rules/14/expr",
-    "value" : "(kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} == kube_horizontalpodautoscaler_spec_max_replicas{job=\"kube-state-metrics\",namespace=~\".*\"}) and kube_horizontalpodautoscaler_status_current_replicas{job=\"kube-state-metrics\",namespace=~\".*\"} > 1"
-  }
-]
\ No newline at end of file
diff --git a/monitoring/values-prom-operator.yaml b/monitoring/values-prom-operator.yaml
index 737b8535..f606bb6d 100644
--- a/monitoring/values-prom-operator.yaml
+++ b/monitoring/values-prom-operator.yaml
@@ -12,6 +12,29 @@
 commonLabels:
   sas.com/monitoring-base: kube-viya-monitoring
 
+defaultRules:
+  disabled:
+    KubeHpaMaxedOut: true
+
+additionalPrometheusRulesMap:
+  sas-modified-default-rules:
+    groups:
+      - name: kubernetes-apps
+        rules:
+          - alert: KubeHpaMaxedOutMultiPod
+            annotations:
+              description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
+                has been running at max replicas for longer than 15 minutes.
+              runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
+              summary: HPA is running at max replicas
+            expr: (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics",namespace=~".*"}
+              == kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics",namespace=~".*"})
+              and kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics",namespace=~".*"}
+              > 1
+            for: 15m
+            labels:
+              severity: warning
+
 # ===================
 # Prometheus Operator
 # ===================
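
A minimal post-deploy check (a sketch only: it assumes the $MON_NS namespace
variable used by monitoring/bin/deploy_monitoring_cluster.sh, and it greps
rather than naming the PrometheusRule object, since the name rendered from
additionalPrometheusRulesMap depends on the helm release name):

    # List the PrometheusRule objects rendered by kube-prometheus-stack;
    # one of them should now carry the sas-modified-default-rules group.
    kubectl get prometheusrule -n $MON_NS

    # Only the renamed alert should match; the stock KubeHpaMaxedOut rule
    # is no longer rendered because of defaultRules.disabled.
    kubectl get prometheusrule -n $MON_NS -o yaml | grep 'alert: KubeHpaMaxedOut'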