File tree 3 files changed +23
-4
lines changed
3 files changed +23
-4
lines changed Original file line number Diff line number Diff line change @@ -10,6 +10,7 @@ configMapGenerator:
10
10
- vault.yaml.tmpl
11
11
12
12
- stock/container.yaml.tmpl
13
+ - stock/metrics.yaml.tmpl
13
14
- stock/missing_replicas.yaml.tmpl
14
15
- stock/namespace_sync.yaml.tmpl
15
16
- stock/storage.yaml.tmpl
Original file line number Diff line number Diff line change
1
+ # PROMETHEUS RULES
2
+ # DO NOT REMOVE line above, used in `pre-commit` hook
3
+
4
+ groups:
5
+ - name: metrics
6
+ rules:
7
+ # thanos-compact is a slow crasher, so it needs a more sensitive
8
+ # "ContainerRestartingOften" alert than the stock one
9
+ - alert: ThanosCompactRestartingOften
10
+ expr: increase(kube_pod_container_status_restarts_total{container="thanos-compact"}[2h]) > 3
11
+ labels:
12
+ team: infra
13
+ annotations:
14
+ summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h"
15
+ impact: "Container may be crashlooping and not working as expected"
16
+ action: "Check pod status and container logs to figure out if there's a problem"
17
+ command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"
18
+ # Grafana Explore deep link into Loki. The `left` parameter is URL-encoded JSON, so every
+ # escaped quote around a label value must be the full sequence %5C%22 (backslash + quote).
+ logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%22{{ $labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D"
Original file line number Diff line number Diff line change 4
4
groups:
5
5
- name: Container
6
6
rules:
7
- # Set period to 2h to capture slow crashing containers like
8
- # thanos-compact that take a long time to start up
7
+ # Set period to 30m to cater to most workloads. Slow crashing containers
8
+ # like `thanos-compact` need to set up a dedicated alert.
9
9
- alert: ContainerRestartingOften
10
- expr: (increase(kube_pod_container_status_restarts_total[2h ]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
10
+ expr: (increase(kube_pod_container_status_restarts_total[30m]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
11
11
labels:
12
12
alerttype: stock
13
13
alertgroup: container
14
14
annotations:
15
- summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h "
15
+ summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 30m"
16
16
impact: "Container may be crashlooping and not working as expected"
17
17
action: "Check pod status and container logs to figure out if there's a problem"
18
18
command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe pod {{ $labels.pod }}"
You can’t perform that action at this time.
0 commit comments