Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ns owner #57

Merged
merged 20 commits into from
Nov 24, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Update container alerts
hectorhuertas committed Nov 24, 2023
commit 09f846f4424ac3c6729e0d7e99b4cef0df760d74
34 changes: 3 additions & 31 deletions common/stock/container.yaml.tmpl
Original file line number Diff line number Diff line change
@@ -7,16 +7,7 @@ groups:
# Set period to 2h to capture slow crashing containers like
# thanos-compact that take a long time to start up
- alert: ContainerRestartingOften
<<<<<<< HEAD
expr: increase(kube_pod_container_status_restarts_total[2h]) > 3
=======
expr: |
(
increase(kube_pod_container_status_restarts_total[10m]) > 3
)
* on (namespace) group_left(team) uw_namespace_oncall_team
keep_firing_for: 10m
>>>>>>> f37f257 (Update container alerts with team detection)
expr: (increase(kube_pod_container_status_restarts_total[2h]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team
labels:
alerttype: stock
alertgroup: container
@@ -28,20 +19,7 @@ groups:
logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%22{{ $labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D"
# https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867
- alert: ContainerCpuThrottled
<<<<<<< HEAD
expr: sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0.95
=======
# https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867
expr: |
(
(
sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace)
/
sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
) > 0.95
)
* on (namespace) group_left(team) uw_namespace_oncall_team
>>>>>>> f37f257 (Update container alerts with team detection)
expr: ((sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)) > 0.95) * on (namespace) group_left(team) uw_namespace_oncall_team
for: 15m
labels:
alerttype: stock
@@ -52,13 +30,7 @@ groups:
action: "Investigate CPU consumption and adjust pods resources if needed."
dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/VAE0wIcik/kubernetes-pod-resources?orgId=1&refresh=1m&from=now-12h&to=now&var-instance=All&var-namespace={{ $labels.namespace }}"
- alert: ContainerOOMing
expr: |
(
kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}
and on (container,pod)
(kube_pod_container_status_ready == 0)
)
* on (namespace) group_left(team) uw_namespace_oncall_team
expr: (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) (kube_pod_container_status_ready == 0)) * on (namespace) group_left(team) uw_namespace_oncall_team
for: 5m
labels:
alerttype: stock