Skip to content

Commit 2e8bebb

Browse files
committed
Reduce ContainerRestarting window in stock alerts
1 parent 4585e6a commit 2e8bebb

File tree

3 files changed

+23
-4
lines changed

3 files changed

+23
-4
lines changed

common/kustomization.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ configMapGenerator:
1010
- vault.yaml.tmpl
1111

1212
- stock/container.yaml.tmpl
13+
- stock/metrics.yaml.tmpl
1314
- stock/missing_replicas.yaml.tmpl
1415
- stock/namespace_sync.yaml.tmpl
1516
- stock/storage.yaml.tmpl

common/metrics.yaml.tmpl

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# PROMETHEUS RULES
2+
# DO NOT REMOVE line above, used in `pre-commit` hook
3+
4+
groups:
5+
- name: metrics
6+
rules:
7+
# thanos-compact is a slow crasher, so it needs a more sensitive
8+
# "ContainerRestartingOften" alert than the stock one
9+
- alert: ThanosCompactRestartingOften
10+
expr: increase(kube_pod_container_status_restarts_total{container="thanos-compact"}[2h]) > 3
11+
labels:
12+
team: infra
13+
annotations:
14+
summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h"
15+
impact: "Container may be crashlooping and not working as expected"
16+
action: "Check pod status and container logs to figure out if there's a problem"
17+
command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"
18+
logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%22{{ $labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D"

common/stock/container.yaml.tmpl

+4-4
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@
44
groups:
55
- name: Container
66
rules:
7-
# Set period to 2h to capture slow crashing containers like
8-
# thanos-compact that take a long time to start up
7+
# Set period to 30m to cater to most workloads. Slow crashing containers
8+
# like `thanos-compact` need a dedicated alert.
99
- alert: ContainerRestartingOften
10-
expr: (increase(kube_pod_container_status_restarts_total[2h]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
10+
expr: (increase(kube_pod_container_status_restarts_total[30m]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
1111
labels:
1212
alerttype: stock
1313
alertgroup: container
1414
annotations:
15-
summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h"
15+
summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 30m"
1616
impact: "Container may be crashlooping and not working as expected"
1717
action: "Check pod status and container logs to figure out if there's a problem"
1818
command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"

0 commit comments

Comments
 (0)