utilitywarehouse · hectorhuertas · Apr 12, 2024 · Apr 12, 2024
@@ -7,6 +7,7 @@ configMapGenerator:
       - canary.yaml
       - daemonset.yaml.tmpl=daemonset.yaml.tmpl
       - logging.yaml.tmpl=logging.yaml.tmpl
+      - metrics.yaml.tmpl
       - vault.yaml.tmpl
 
       - stock/container.yaml.tmpl

@@ -0,0 +1,18 @@
+# PROMETHEUS RULES
+# DO NOT REMOVE line above, used in `pre-commit` hook
+
+groups:
+  - name: metrics
+    rules:
+      # thanos-compact is a slow crasher, so it needs a longer (2h) window
+      # than the stock "ContainerRestartingOften" alert to catch its restarts
+      - alert: ThanosCompactRestartingOften
+        expr: increase(kube_pod_container_status_restarts_total{container="thanos-compact"}[2h]) > 3
+        labels:
+          team: infra
+        annotations:
+          summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h"
+          impact: "Container may be crashlooping and not working as expected"
+          action: "Check pod status and container logs to figure out if there's a problem"
+          command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"
+          logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%22{{ $labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D"
@@ -4,15 +4,15 @@
 groups:
   - name: Container
     rules:
-      # Set period to 2h to capture slow crashing containers like
-      # thanos-compact that take a long time to start up
+      # Set period to 30m to cater to most workloads. Slow-crashing containers
+      # like `thanos-compact` need a dedicated alert with a longer period.
       - alert: ContainerRestartingOften
-        expr: (increase(kube_pod_container_status_restarts_total[2h]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
+        expr: (increase(kube_pod_container_status_restarts_total[30m]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
         labels:
           alerttype: stock
           alertgroup: container
         annotations:
-          summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h"
+          summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 30m"
           impact: "Container may be crashlooping and not working as expected"
           action: "Check pod status and container logs to figure out if there's a problem"
           command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"