Reduce ContainerRestarting window in stock alerts

hectorhuertas · hectorhuertas · commit 2e8bebbeff2d · 2024-04-12T11:24:59.000+02:00
diff --git a/common/kustomization.yaml b/common/kustomization.yaml
@@ -10,6 +10,7 @@ configMapGenerator:
       - vault.yaml.tmpl
 
       - stock/container.yaml.tmpl
+      - stock/metrics.yaml.tmpl
       - stock/missing_replicas.yaml.tmpl
       - stock/namespace_sync.yaml.tmpl
       - stock/storage.yaml.tmpl
diff --git a/common/metrics.yaml.tmpl b/common/metrics.yaml.tmpl
@@ -0,0 +1,18 @@
+# PROMETHEUS RULES
+# DO NOT REMOVE line above, used in `pre-commit` hook
+
+groups:
+  - name: metrics
+    rules:
+      # thanos-compact crashes slowly (it takes a long time to start up), so it
+      # keeps a wider 2h window than the stock "ContainerRestartingOften" alert
+      - alert: ThanosCompactRestartingOften
+        expr: increase(kube_pod_container_status_restarts_total{container="thanos-compact"}[2h]) > 3
+        labels:
+          team: infra
+        annotations:
+          summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h"
+          impact: "Container may be crashlooping and not working as expected"
+          action: "Check pod status and container logs to figure out if there's a problem"
+          command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"
+          logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%22{{ $labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D"
diff --git a/common/stock/container.yaml.tmpl b/common/stock/container.yaml.tmpl
@@ -4,15 +4,15 @@
 groups:
   - name: Container
     rules:
-      # Set period to 2h to capture slow crashing containers like
-      # thanos-compact that take a long time to start up
+      # Set period to 30m to cater to most workloads. Slow crashing containers
+      # like `thanos-compact` need to set up a dedicated alert.
       - alert: ContainerRestartingOften
-        expr: (increase(kube_pod_container_status_restarts_total[2h]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
+        expr: (increase(kube_pod_container_status_restarts_total[30m]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
         labels:
           alerttype: stock
           alertgroup: container
         annotations:
-          summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h"
+          summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 30m"
           impact: "Container may be crashlooping and not working as expected"
           action: "Check pod status and container logs to figure out if there's a problem"
           command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"