From 7b00e1f4a7d51898f771031b313813ede9b0f99b Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 27 Oct 2023 15:57:51 +0200 Subject: [PATCH 01/20] Add uw_namespace_owner rule --- common/kustomization.yaml | 1 + common/stock/team_detection.yaml | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 common/stock/team_detection.yaml diff --git a/common/kustomization.yaml b/common/kustomization.yaml index b6d5be1..6a68911 100644 --- a/common/kustomization.yaml +++ b/common/kustomization.yaml @@ -13,6 +13,7 @@ configMapGenerator: - stock/namespace_sync.yaml.tmpl - stock/storage.yaml.tmpl - stock/terraform_sync.yaml.tmpl + - stock/team_detection.yaml - stock/vault-clients.yaml.tmpl name: alert-templates-common diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml new file mode 100644 index 0000000..17de9bf --- /dev/null +++ b/common/stock/team_detection.yaml @@ -0,0 +1,5 @@ +groups: + - name: team_detection + rules: + - record: uw_namespace_owner + expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)")) From dc4c205ad8ff9c0f7b65f4c38559b2a9c81bafb1 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Mon, 30 Oct 2023 10:41:29 +0100 Subject: [PATCH 02/20] Remove sum --- common/stock/team_detection.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 17de9bf..094016e 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -2,4 +2,5 @@ groups: - name: team_detection rules: - record: uw_namespace_owner - expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)")) + expr: label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)") + #expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", 
"label_uw_systems_owner", "(system|partner)")) From 47f422f0bb690dd5743d555fc44d84a369ba4423 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Wed, 15 Nov 2023 15:51:15 +0100 Subject: [PATCH 03/20] Add uw_namespace_team rule --- common/stock/team_detection.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 094016e..5b5b6be 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -2,5 +2,6 @@ groups: - name: team_detection rules: - record: uw_namespace_owner - expr: label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)") - #expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)")) + expr: label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner)") + - record: uw_namespace_team + expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system)")) From 459105400748730537288d5e497c17f3ec0c9f62 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 16 Nov 2023 12:22:27 +0100 Subject: [PATCH 04/20] Update alerts to test changes --- common/stock/namespace_sync.yaml.tmpl | 9 ++++++--- common/stock/team_detection.yaml | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index aaeef07..f02f2ae 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -6,19 +6,22 @@ groups: rules: - alert: ArgoCDApplicationNotSynced expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 + #expr: argocd_app_info * on (namespace) group_left(team) sum by (namespace, team) (label_replace(kube_namespace_labels{job!="opencost"}, "team", "$1", 
"label_uw_systems_owner", "(system|partner)")) for: 1h labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} is not synced for last 1h" impact: "Some manifests won't be automatically deployed." action: "Check the web UI / logs for errors." link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled - expr: argocd_app_info{autosync_enabled="false"} == 1 + expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team for: 1h labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Auto Sync on Argo CD application {{$labels.name}} is disabled" impact: "New manifests won't be automatically deployed." diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 5b5b6be..d4e9f29 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -3,5 +3,6 @@ groups: rules: - record: uw_namespace_owner expr: label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner)") + # used as: `metric_with_namespace * on (namespace) group_left (team) uw_namespace_team - record: uw_namespace_team expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system)")) From 1d4a6fbdecaeb4e9ce4cefe4a781f81c1b55bcfe Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 16 Nov 2023 12:47:35 +0100 Subject: [PATCH 05/20] Patch for testing --- common/stock/namespace_sync.yaml.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index f02f2ae..d1e7f49 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ 
-18,7 +18,7 @@ groups: link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team - for: 1h + #for: 1h labels: alerttype: stock alertgroup: namespace_sync From 6693bb96a8ec64c48fa91190f8883c6c89419fdc Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 17 Nov 2023 11:39:17 +0100 Subject: [PATCH 06/20] Remove testing patch --- common/stock/namespace_sync.yaml.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index d1e7f49..f02f2ae 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -18,7 +18,7 @@ groups: link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team - #for: 1h + for: 1h labels: alerttype: stock alertgroup: namespace_sync From a606b3205c2f6cc7c72a06db115e7fdc79d13460 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 17 Nov 2023 11:42:07 +0100 Subject: [PATCH 07/20] Remove unnecessary recording rule --- common/stock/team_detection.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index d4e9f29..544fd21 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -1,8 +1,10 @@ +# PROMETHEUS RULES +# DO NOT REMOVE line above, used in `pre-commit` hook + groups: - name: team_detection rules: - - record: uw_namespace_owner - expr: label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner)") - # used as: 
`metric_with_namespace * on (namespace) group_left (team) uw_namespace_team + # used as: ` * on (namespace) group_left + # (team) uw_namespace_team` - record: uw_namespace_team - expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system)")) + expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) From b69488544823c8ca864ff952784150e257e79c5d Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 17 Nov 2023 11:44:52 +0100 Subject: [PATCH 08/20] Remove partial changes --- common/stock/namespace_sync.yaml.tmpl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index f02f2ae..2d484d2 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -6,11 +6,9 @@ groups: rules: - alert: ArgoCDApplicationNotSynced expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 - #expr: argocd_app_info * on (namespace) group_left(team) sum by (namespace, team) (label_replace(kube_namespace_labels{job!="opencost"}, "team", "$1", "label_uw_systems_owner", "(system|partner)")) for: 1h labels: - alerttype: stock - alertgroup: namespace_sync + group: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} is not synced for last 1h" impact: "Some manifests won't be automatically deployed." 
From f1539fce15ce42544ec2723122dd923af47488cc Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Tue, 21 Nov 2023 17:02:42 +0100 Subject: [PATCH 09/20] Update readme --- common/stock/README.md | 77 ++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/common/stock/README.md b/common/stock/README.md index 7749f9d..4c92e4c 100644 --- a/common/stock/README.md +++ b/common/stock/README.md @@ -6,51 +6,78 @@ to consume. Stock alerts are already setup and "firing" for all teams, and they only need to be consumed by teams, if they choose to. -To consume the stock alerts, add a new route on alertmanager filtering your -namespaces and pointing to your receiver. The `matchers` clause follows the -usual prometheus syntax. +To consume the alerts, there are 2 options: +* Opt into automatic team detection based on namespace owner (recommended) +* Create a new alertmanager route to manually consume alerts for whatever + namespace you care about -Example (with recommended grouping): +### Automatic team detection based on `uw.systems/owner` namespace label +Stock alerts can generate a `team` label with the value of the +`uw.systems/owner` label of the namespace that the alert belongs to. + +To opt-in into this team detection, add your `uw.systems/owner` value to the +regex at the end of +https://github.com/utilitywarehouse/system-alerts/blob/main/common/stock/team_detection.yaml#L10. +Ask in #infra if you need help setting this up. + +Now the stock alerts for your namespaces will have a `team` label and be +captured by your existing router. + +### Manual configuration of specific namespaces +Add a new route on alertmanager filtering your namespaces and pointing to your +receiver. The `matchers` clause follows the usual prometheus syntax. + +Example: ``` route: + ... routes: - - matchers: ['{namespace=~"myteam-.*"}'] + ... 
+ - matchers: ['{alerttype="stock", namespace=~"myteam-.*|alsoimportant"}'] receiver: myteam-receiver - group_by: ["alertname", "namespace", "deployment", "statefulset"] + ... ``` -If you want to opt out of some alerts, you can have a subroute matching what -you don't want and sending it to the `deadletter` receiver. +### Note on grouping alerts +Stock alerts use the default grouping found at the top of the [alertmanager +config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12) -Example ignoring some alerts: +If your team alerts need different grouping, you can configure it by adding a +subroute for your alerts with your custom grouping: ``` route: + ... routes: - - matchers: ['{namespace=~"myteam-.*"}'] + ... + - matchers: ['{team="myteam"}'] receiver: myteam-receiver - group_by: ["alertname", "namespace", "deployment", "statefulset"] routes: - # Example of ignoring some alerts by sending them to `deadletter` - - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] - receiver: deadletter + # Example of custom grouping for non-stock alerts + - matchers: ['{alerttype!="stock"}'] + group_by: ["your", "custom", "grouping"] + ... ``` -## Note on `team` label and catching-non-stock alerts -Matchers filtering only by namespace can also match team's own alerts, which -could be undesired. If you need different configuration for stock alerts and -your team dedicated alerts, you need to tweak the filters. +### Opting out of some alerts +If you want to opt out of some alerts, you can have a subroute matching what +you don't want and sending it to the `deadletter` receiver. -Example of different routes for stock and team alerts(assuming team alerts use -`team` label): +Example ignoring some alerts: ``` route: + ... routes: - - matchers: ['{namespace=~"myteam-.*", team=""}'] - receiver: myteam-receiver-for-stock-alerts - ... + ... 
- matchers: ['{team="myteam"}'] - receiver: myteam-receiver-for-team-alerts - ... + receiver: myteam-receiver + routes: + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertgroup="storage"}'] + receiver: deadletter + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] + receiver: deadletter + ... ``` ## Notes for @system From ac4ca8c0aca6d9fc8a813ecb2f05dfa41e7b36df Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 11:44:43 +0100 Subject: [PATCH 10/20] Add uw_namespace_oncall_team rule --- common/stock/team_detection.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 544fd21..4c90cec 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -8,3 +8,6 @@ groups: # (team) uw_namespace_team` - record: uw_namespace_team expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) + # used as: ` * on (namespace) group_left (team) uw_namespace_oncall_team` + - record: uw_namespace_oncall_team + expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) From 186649e96af4fdc07020487846de88eae3a64992 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 11:52:26 +0100 Subject: [PATCH 11/20] Update alert labels --- common/stock/container.yaml.tmpl | 9 ++++++--- common/stock/missing_replicas.yaml.tmpl | 18 ++++++++++++------ common/stock/namespace_sync.yaml.tmpl | 9 ++++++--- common/stock/storage.yaml.tmpl | 6 ++++-- common/stock/terraform_sync.yaml.tmpl | 3 ++- common/stock/vault-clients.yaml.tmpl | 6 ++++-- 6 files changed, 34 insertions(+), 17 deletions(-) diff --git a/common/stock/container.yaml.tmpl 
b/common/stock/container.yaml.tmpl index 5ecc1a3..eac61b7 100644 --- a/common/stock/container.yaml.tmpl +++ b/common/stock/container.yaml.tmpl @@ -9,7 +9,8 @@ groups: - alert: ContainerRestartingOften expr: increase(kube_pod_container_status_restarts_total[2h]) > 3 labels: - group: container + alerttype: stock + alertgroup: container annotations: summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h" impact: "Container may be crashlooping and not working as expected" @@ -21,7 +22,8 @@ groups: expr: sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0.95 for: 15m labels: - group: container + alerttype: stock + alertgroup: container annotations: summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} is being CPU throttled." impact: "Container might take longer than normal to respond to requests." 
@@ -31,7 +33,8 @@ groups: expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) kube_pod_container_status_ready == 0 for: 5m labels: - group: container + alerttype: stock + alertgroup: container annotations: summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has been OOMKilled recently and it's not ready" impact: "Container not ready, may affect service uptime" diff --git a/common/stock/missing_replicas.yaml.tmpl b/common/stock/missing_replicas.yaml.tmpl index 7b37bc0..126979f 100644 --- a/common/stock/missing_replicas.yaml.tmpl +++ b/common/stock/missing_replicas.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} for: 15m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Deployment {{$labels.namespace}}/{{$labels.deployment}} has missing replicas for 15m" impact: "Workload may be unavailable or have lost high availability" @@ -18,7 +19,8 @@ groups: expr: (kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} for: 15m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Statefulset {{$labels.namespace}}/{{$labels.statefulset}} has missing replicas for 15m" impact: "Workload may be unavailable or have lost high availability" @@ -29,7 +31,8 @@ groups: expr: (kube_daemonset_status_number_ready != kube_daemonset_status_desired_number_scheduled) and changes(kube_daemonset_status_updated_number_scheduled[10m]) == 0 for: 5m labels: - group: missing_replicas + alerttype: 
stock + alertgroup: missing_replicas annotations: summary: "Daemonset {{$labels.namespace}}/{{$labels.daemonset}} has missing replicas" impact: "Workload unavailable on some nodes" @@ -39,7 +42,8 @@ groups: expr: (kube_deployment_status_replicas_available == 0 and kube_deployment_status_replicas != 0) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} for: 5m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Deployment {{$labels.namespace}}/{{$labels.deployment}} has 0 healthy replicas." impact: "Workload is down" @@ -49,7 +53,8 @@ groups: expr: (kube_statefulset_status_replicas_ready == 0 and kube_statefulset_status_replicas != 0) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} for: 5m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Statefulset {{$labels.namespace}}/{{$labels.statefulset}} has 0 healthy replicas." impact: "Workload is down" @@ -59,7 +64,8 @@ groups: expr: (kube_daemonset_status_number_ready == 0 and kube_daemonset_status_desired_number_scheduled != 0) for: 5m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Daemonset {{$labels.namespace}}/{{$labels.daemonset}} has 0 healthy replicas." 
impact: "Workload is down" diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index 2d484d2..7fce46b 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 for: 1h labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} is not synced for last 1h" impact: "Some manifests won't be automatically deployed." @@ -33,7 +34,8 @@ groups: - alert: ArgoCDApplicationSyncFailure expr: increase(argocd_app_sync_total{phase=~"Error|Failed"}[1h]) > 0 labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} Sync failed" impact: "Some manifests won't be automatically deployed." @@ -43,7 +45,8 @@ groups: expr: kube_applier_last_run_success != 1 for: 1h10m labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "kube-applier encountered errors while applying {{ $labels.namespace }}" impact: Some manifest won't be automatically deployed. 
diff --git a/common/stock/storage.yaml.tmpl b/common/stock/storage.yaml.tmpl index 656da7b..6371b94 100644 --- a/common/stock/storage.yaml.tmpl +++ b/common/stock/storage.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 72 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 for: 2h labels: - group: storage + alerttype: stock + alertgroup: storage annotations: summary: "Volume {{$labels.namespace}}/{{$labels.persistentvolumeclaim}} will fill up in 72h" impact: "Exhausting available disk space will most likely result in service disruption" @@ -18,7 +19,8 @@ groups: expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 6 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 for: 30m labels: - group: storage + alerttype: stock + alertgroup: storage annotations: summary: "Volume {{$labels.namespace}}/{{$labels.persistentvolumeclaim}} will fill up in 6h" impact: "Exhausting available disk space will most likely result in service disruption" diff --git a/common/stock/terraform_sync.yaml.tmpl b/common/stock/terraform_sync.yaml.tmpl index b26dd4c..4b04fb1 100644 --- a/common/stock/terraform_sync.yaml.tmpl +++ b/common/stock/terraform_sync.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: terraform_applier_module_last_run_success == 0 for: 1h10m labels: - group: terraform_sync + alerttype: stock + alertgroup: terraform_sync annotations: summary: "Terraform module {{ $labels.module }} in {{ $labels.namespace }} encountered errors during last terraform run" description: | diff --git a/common/stock/vault-clients.yaml.tmpl b/common/stock/vault-clients.yaml.tmpl index 9ae29f8..ad8d1f2 100644 --- a/common/stock/vault-clients.yaml.tmpl +++ b/common/stock/vault-clients.yaml.tmpl @@ -9,7 +9,8 @@ groups: expr: time() - vkcc_sidecar_expiry_timestamp_seconds > 0 for: 10m labels: - group: vault_clients + alerttype: stock + alertgroup: vault_clients annotations: 
description: | The credentials served by the vault credentials agent sidecar have expired and have not @@ -20,7 +21,8 @@ groups: expr: (kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) kube_pod_status_scheduled{condition="true"} == 1) unless on (pod,namespace) kube_pod_container_info{container=~"vault-credentials-agent.*"} for: 10m labels: - group: vault_clients + alerttype: stock + alertgroup: vault_clients annotations: description: | The pod is annotated with `{{ $labels.key }}={{ $labels.value }}` but does not have a From 2f79d6e0d220f67a410b412eda2d196fd19ee460 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 12:25:52 +0100 Subject: [PATCH 12/20] Update container alerts with team detection --- common/stock/container.yaml.tmpl | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/common/stock/container.yaml.tmpl b/common/stock/container.yaml.tmpl index eac61b7..5d355be 100644 --- a/common/stock/container.yaml.tmpl +++ b/common/stock/container.yaml.tmpl @@ -7,7 +7,16 @@ groups: # Set period to 2h to capture slow crashing containers like # thanos-compact that take a long time to start up - alert: ContainerRestartingOften +<<<<<<< HEAD expr: increase(kube_pod_container_status_restarts_total[2h]) > 3 +======= + expr: | + ( + increase(kube_pod_container_status_restarts_total[10m]) > 3 + ) + * on (namespace) group_left(team) uw_namespace_oncall_team + keep_firing_for: 10m +>>>>>>> f37f257 (Update container alerts with team detection) labels: alerttype: stock alertgroup: container @@ -19,7 +28,20 @@ groups: logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%{{ 
$labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D" # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 - alert: ContainerCpuThrottled +<<<<<<< HEAD expr: sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0.95 +======= + # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 + expr: | + ( + ( + sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) + ) > 0.95 + ) + * on (namespace) group_left(team) uw_namespace_oncall_team +>>>>>>> f37f257 (Update container alerts with team detection) for: 15m labels: alerttype: stock @@ -30,7 +52,13 @@ groups: action: "Investigate CPU consumption and adjust pods resources if needed." 
dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/VAE0wIcik/kubernetes-pod-resources?orgId=1&refresh=1m&from=now-12h&to=now&var-instance=All&var-namespace={{ $labels.namespace }}" - alert: ContainerOOMing - expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) kube_pod_container_status_ready == 0 + expr: | + ( + kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} + and on (container,pod) + (kube_pod_container_status_ready == 0) + ) + * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock From 49947865eb5f3e9bd7a9a81ccad5459b24ee8ba4 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 14:46:21 +0100 Subject: [PATCH 13/20] Fix uw_namespace_oncall_team --- common/stock/team_detection.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 4c90cec..4a78594 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -10,4 +10,6 @@ groups: expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) # used as: ` * on (namespace) group_left (team) uw_namespace_oncall_team` - record: uw_namespace_oncall_team - expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) + expr: sum by (namespace, team) (label_replace(kube_namespace_annotations{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) + #- record: uw_namespace_oncall_team + # expr: sum by (namespace, team) (label_replace(kube_namespace_annotations, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) From 18bb084b1bf09a9111135ded07745ce37fe57f58 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:04:19 +0100 Subject: 
[PATCH 14/20] Update readme --- common/stock/README.md | 92 +++++++++--------------------------------- 1 file changed, 19 insertions(+), 73 deletions(-) diff --git a/common/stock/README.md b/common/stock/README.md index 4c92e4c..695b4bd 100644 --- a/common/stock/README.md +++ b/common/stock/README.md @@ -3,42 +3,38 @@ Common and useful alerts maintained by @system and readily available for teams to consume. ## Usage -Stock alerts are already setup and "firing" for all teams, and they only need -to be consumed by teams, if they choose to. +Stock alerts are already setup and "firing" for all namespaces. Teams only need +to claim namespace oncall responsibility to receive them. -To consume the alerts, there are 2 options: -* Opt into automatic team detection based on namespace owner (recommended) -* Create a new alertmanager route to manually consume alerts for whatever - namespace you care about +To do so, add a `uw.systems/oncall-team` annotation to namespaces to claim +them, and the team will receive the stock alerts for those namespaces +automatically. -### Automatic team detection based on `uw.systems/owner` namespace label -Stock alerts can generate a `team` label with the value of the -`uw.systems/owner` label of the namespace that the alert belongs to. +Namespace definitions live in `kubernetes-manifests//kube-system/namespaces.yaml` -To opt-in into this team detection, add your `uw.systems/owner` value to the -regex at the end of -https://github.com/utilitywarehouse/system-alerts/blob/main/common/stock/team_detection.yaml#L10. -Ask in #infra if you need help setting this up. - -Now the stock alerts for your namespaces will have a `team` label and be -captured by your existing router. - -### Manual configuration of specific namespaces -Add a new route on alertmanager filtering your namespaces and pointing to your -receiver. The `matchers` clause follows the usual prometheus syntax. 
+### Opting out of some alerts +If you want to opt out of some alerts, you can have a subroute matching what +you don't want and sending it to the `deadletter` receiver. -Example: +Example ignoring some alerts: ``` route: ... routes: ... - - matchers: ['{alerttype="stock", namespace=~"myteam-.*|alsoimportant"}'] + - matchers: ['{team="myteam"}'] receiver: myteam-receiver + routes: + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertgroup="storage"}'] + receiver: deadletter + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] + receiver: deadletter ... ``` -### Note on grouping alerts +### Note on alert grouping Stock alerts use the default grouping found at the top of the [alertmanager config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12) @@ -57,53 +53,3 @@ route: group_by: ["your", "custom", "grouping"] ... ``` - -### Opting out of some alerts -If you want to opt out of some alerts, you can have a subroute matching what -you don't want and sending it to the `deadletter` receiver. - -Example ignoring some alerts: -``` -route: - ... - routes: - ... - - matchers: ['{team="myteam"}'] - receiver: myteam-receiver - routes: - # Example of ignoring some alerts by sending them to `deadletter` - - matchers: ['{alertgroup="storage"}'] - receiver: deadletter - # Example of ignoring some alerts by sending them to `deadletter` - - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] - receiver: deadletter - ... -``` - -## Notes for @system - -### Note on `namespace` vs `kubernetes_namespace` labels -There are two possible "namespace" labels in metrics -* `namespace` is the namespace labeled by a metric inside it's exporter. 
It is - relevant in metrics exposed by workloads that are aware of kubernetes - namespace as a concept, like metrics coming from argocd or - kube-state-metrics. -* `kubernetes_namespace` is the namespace where the metric was scraped from. - This is relevant in metrics exposed by workloads that do not deal with - kubernetes namespaces as a concept. - -Currently all the metrics used in the stock alerts rely only on `namespace` label, but this may change in the future. We could either add a second matcher or relabel `kubernetes_namespace` to `namespace` in all metrics where `namespace` is not set. - -### Note for @system team configuration -There are metrics that have a @system `kubernetes_namespace` but a non-@system -`namespace`, like an argocd metric that comes from a @system namespace but is -talking about another team's namespace (`argocd_app_info{sync_status!="Synced", -kubernetes_namespace="sys-argo-cd", namespace="billing"} 1`). - -However, there are no metrics that have a @system `namespace` but a non-@system -`kubernetes_namespace`. This can be verified by running `group by (__name__) -({namespace=~"kube-system|sys-.*", -kubernetes_namespace!~"|kube-system|sys-.*"})` and getting no results. - -For this reasons, @system needs to be more careful with it's matchers and ensure -it's not catching team's metrics by accident. 
From 09f846f4424ac3c6729e0d7e99b4cef0df760d74 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:06:49 +0100 Subject: [PATCH 15/20] Update container alerts --- common/stock/container.yaml.tmpl | 34 +++----------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/common/stock/container.yaml.tmpl b/common/stock/container.yaml.tmpl index 5d355be..e616458 100644 --- a/common/stock/container.yaml.tmpl +++ b/common/stock/container.yaml.tmpl @@ -7,16 +7,7 @@ groups: # Set period to 2h to capture slow crashing containers like # thanos-compact that take a long time to start up - alert: ContainerRestartingOften -<<<<<<< HEAD - expr: increase(kube_pod_container_status_restarts_total[2h]) > 3 -======= - expr: | - ( - increase(kube_pod_container_status_restarts_total[10m]) > 3 - ) - * on (namespace) group_left(team) uw_namespace_oncall_team - keep_firing_for: 10m ->>>>>>> f37f257 (Update container alerts with team detection) + expr: (increase(kube_pod_container_status_restarts_total[2h]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team labels: alerttype: stock alertgroup: container @@ -28,20 +19,7 @@ groups: logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%{{ $labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D" # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 - alert: ContainerCpuThrottled -<<<<<<< HEAD - expr: 
sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0.95 -======= - # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 - expr: | - ( - ( - sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) - ) > 0.95 - ) - * on (namespace) group_left(team) uw_namespace_oncall_team ->>>>>>> f37f257 (Update container alerts with team detection) + expr: ((sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)) > 0.95) * on (namespace) group_left(team) uw_namespace_oncall_team for: 15m labels: alerttype: stock @@ -52,13 +30,7 @@ groups: action: "Investigate CPU consumption and adjust pods resources if needed." 
dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/VAE0wIcik/kubernetes-pod-resources?orgId=1&refresh=1m&from=now-12h&to=now&var-instance=All&var-namespace={{ $labels.namespace }}" - alert: ContainerOOMing - expr: | - ( - kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} - and on (container,pod) - (kube_pod_container_status_ready == 0) - ) - * on (namespace) group_left(team) uw_namespace_oncall_team + expr: (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) (kube_pod_container_status_ready == 0)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock From 4d70942a490c31aa96adf98c6589b3f8b11a0b3b Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:20:49 +0100 Subject: [PATCH 16/20] Update missing_replicas alerts --- common/stock/missing_replicas.yaml.tmpl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/stock/missing_replicas.yaml.tmpl b/common/stock/missing_replicas.yaml.tmpl index 126979f..a1bf45b 100644 --- a/common/stock/missing_replicas.yaml.tmpl +++ b/common/stock/missing_replicas.yaml.tmpl @@ -5,7 +5,7 @@ groups: - name: MissingReplicas rules: - alert: DeploymentMissingReplicas - expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} + expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 15m labels: alerttype: stock @@ -16,7 +16,7 @@ groups: action: "Check why some replicas are not healthy" command: "kubectl --context $ENVIRONMENT-$PROVIDER 
--namespace {{ $labels.namespace }} describe deployment {{ $labels.deployment }}" - alert: StatefulsetMissingReplicas - expr: (kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} + expr: (kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 15m labels: alerttype: stock @@ -28,7 +28,7 @@ groups: command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe statefulset {{ $labels.statefulset }}" - alert: DaemonsetMissingReplicas # Alert if there are unhealthy replicas and the ds is not updating it's replicas - expr: (kube_daemonset_status_number_ready != kube_daemonset_status_desired_number_scheduled) and changes(kube_daemonset_status_updated_number_scheduled[10m]) == 0 + expr: ((kube_daemonset_status_number_ready != kube_daemonset_status_desired_number_scheduled) and (changes(kube_daemonset_status_updated_number_scheduled[10m]) == 0)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock @@ -39,7 +39,7 @@ groups: action: "Check why some replicas are not healthy" command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe daemonset {{ $labels.daemonset }}" - alert: DeploymentMissingAllReplicas - expr: (kube_deployment_status_replicas_available == 0 and kube_deployment_status_replicas != 0) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} + expr: (kube_deployment_status_replicas_available == 0 and 
kube_deployment_status_replicas != 0) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock @@ -50,7 +50,7 @@ groups: action: "Check why all replicas are missing" command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe deployment {{ $labels.deployment }}" - alert: StatefulsetMissingAllReplicas - expr: (kube_statefulset_status_replicas_ready == 0 and kube_statefulset_status_replicas != 0) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} + expr: (kube_statefulset_status_replicas_ready == 0 and kube_statefulset_status_replicas != 0) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock @@ -61,7 +61,7 @@ groups: action: "Check why all replicas are missing" command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe statefulset {{ $labels.statefulset }}" - alert: DaemonsetMissingAllReplicas - expr: (kube_daemonset_status_number_ready == 0 and kube_daemonset_status_desired_number_scheduled != 0) + expr: (kube_daemonset_status_number_ready == 0 and kube_daemonset_status_desired_number_scheduled != 0) * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock From 2bf0e765d5e3ac871f896042570bdf7d4ae35d39 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:26:33 +0100 Subject: [PATCH 17/20] Update more alerts --- common/stock/namespace_sync.yaml.tmpl | 8 ++++---- common/stock/storage.yaml.tmpl | 4 ++-- 2 files changed, 6 
insertions(+), 6 deletions(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index 7fce46b..7f37525 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -5,7 +5,7 @@ groups: - name: NamespaceSync rules: - alert: ArgoCDApplicationNotSynced - expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 + expr: (argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1) * on (namespace) group_left(team) uw_namespace_oncall_team for: 1h labels: alerttype: stock @@ -16,7 +16,7 @@ groups: action: "Check the web UI / logs for errors." link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled - expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team + expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_oncall_team for: 1h labels: alerttype: stock @@ -32,7 +32,7 @@ groups: `kubectl --context {{$labels.kubernetes_cluster}} -n {{$labels.namespace}} patch --type='merge' applications.argoproj.io {{$labels.name}} -p "{\"spec\":{\"syncPolicy\":{\"automated\":{\"prune\":false,\"selfHeal\":false}}}}"` link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationSyncFailure - expr: increase(argocd_app_sync_total{phase=~"Error|Failed"}[1h]) > 0 + expr: (increase(argocd_app_sync_total{phase=~"Error|Failed"}[1h]) > 0) * on (namespace) group_left(team) uw_namespace_oncall_team labels: alerttype: stock alertgroup: namespace_sync @@ -42,7 +42,7 @@ groups: action: "Check the web UI / logs for errors." 
link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: KubeApplierErrors - expr: kube_applier_last_run_success != 1 + expr: (kube_applier_last_run_success != 1) * on (namespace) group_left(team) uw_namespace_oncall_team for: 1h10m labels: alerttype: stock diff --git a/common/stock/storage.yaml.tmpl b/common/stock/storage.yaml.tmpl index 6371b94..f0bcb7f 100644 --- a/common/stock/storage.yaml.tmpl +++ b/common/stock/storage.yaml.tmpl @@ -5,7 +5,7 @@ groups: - name: Storage rules: - alert: VolumeFillingUpin72h - expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 72 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 + expr: ((predict_linear(kubelet_volume_stats_available_bytes[1h], 72 * 3600) < 0) and (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 2h labels: alerttype: stock @@ -16,7 +16,7 @@ groups: action: "Investigate disk usage and adjust volume size if necessary." 
dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/919b92a8e8041bd567af9edab12c840c/kubernetes-persistent-volumes?orgId=1&refresh=10s&var-datasource=default&var-cluster=${ENVIRONMENT}-${PROVIDER}&var-namespace={{ $labels.namespace }}&var-volume={{ $labels.persistentvolumeclaim }}" - alert: VolumeFillingUpin6h - expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 6 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 + expr: ((predict_linear(kubelet_volume_stats_available_bytes[1h], 6 * 3600) < 0) and (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 30m labels: alerttype: stock From 0f402d08950598bc61fb34e8133a36d33157a971 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:26:45 +0100 Subject: [PATCH 18/20] Drop unnecessary rule --- common/stock/team_detection.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 4a78594..e17db67 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -4,12 +4,7 @@ groups: - name: team_detection rules: - # used as: ` * on (namespace) group_left - # (team) uw_namespace_team` - - record: uw_namespace_team - expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) # used as: ` * on (namespace) group_left (team) uw_namespace_oncall_team` - record: uw_namespace_oncall_team + # job filtering is needed to avoid duplicated `kube_namespace_annotations` from `opencost` job expr: sum by (namespace, team) (label_replace(kube_namespace_annotations{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) - #- record: uw_namespace_oncall_team - # expr: sum by (namespace, team) 
(label_replace(kube_namespace_annotations, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) From 15e8987bc0f20bbfbc9357d78e957ea4aba90a15 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:28:52 +0100 Subject: [PATCH 19/20] Update more alerts --- common/stock/terraform_sync.yaml.tmpl | 3 ++- common/stock/vault-clients.yaml.tmpl | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/common/stock/terraform_sync.yaml.tmpl b/common/stock/terraform_sync.yaml.tmpl index 4b04fb1..307da13 100644 --- a/common/stock/terraform_sync.yaml.tmpl +++ b/common/stock/terraform_sync.yaml.tmpl @@ -5,7 +5,8 @@ groups: - name: TerraformSync rules: - alert: TerraformApplierErrors - expr: terraform_applier_module_last_run_success == 0 + expr: (terraform_applier_module_last_run_success == 0) * on (namespace) group_left(team) uw_namespace_oncall_team + for: 1h10m labels: alerttype: stock diff --git a/common/stock/vault-clients.yaml.tmpl b/common/stock/vault-clients.yaml.tmpl index ad8d1f2..cf69421 100644 --- a/common/stock/vault-clients.yaml.tmpl +++ b/common/stock/vault-clients.yaml.tmpl @@ -6,7 +6,7 @@ groups: # Recommendations from https://s3-us-west-2.amazonaws.com/hashicorp-education/whitepapers/Vault/Vault-Consul-Monitoring-Guide.pdf rules: - alert: VaultSidecarCredentialsExpired - expr: time() - vkcc_sidecar_expiry_timestamp_seconds > 0 + expr: (time() - vkcc_sidecar_expiry_timestamp_seconds > 0) * on (namespace) group_left(team) uw_namespace_oncall_team for: 10m labels: alerttype: stock @@ -18,7 +18,7 @@ groups: summary: "The credentials for '{{ $labels.kubernetes_pod_name }}' have expired" dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars - alert: VaultSidecarMissing - expr: (kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) kube_pod_status_scheduled{condition="true"} == 1) unless on (pod,namespace) 
kube_pod_container_info{container=~"vault-credentials-agent.*"} + expr: ((kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) (kube_pod_status_scheduled{condition="true"} == 1)) unless on (pod,namespace) kube_pod_container_info{container=~"vault-credentials-agent.*"}) * on (namespace) group_left(team) uw_namespace_oncall_team for: 10m labels: alerttype: stock From 710be6350a5bf1008b0e3646180babf4a936170d Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 12:04:05 +0100 Subject: [PATCH 20/20] Update docs about alert grouping --- common/stock/README.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/common/stock/README.md b/common/stock/README.md index 695b4bd..02b330a 100644 --- a/common/stock/README.md +++ b/common/stock/README.md @@ -35,11 +35,9 @@ route: ``` ### Note on alert grouping -Stock alerts use the default grouping found at the top of the [alertmanager -config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12) - -If your team alerts need different grouping, you can configure it by adding a -subroute for your alerts with your custom grouping: +If your team is using a custom grouping that is missing entries from the +default grouping (set at the top of the [alertmanager +config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12)), it is suggested to configure stock alerts to use the stock grouping: ``` route: ... @@ -48,8 +46,8 @@ route: - matchers: ['{team="myteam"}'] receiver: myteam-receiver routes: - # Example of custom grouping for non-stock alerts - - matchers: ['{alerttype!="stock"}'] - group_by: ["your", "custom", "grouping"] + # Example of specifying grouping for stock alerts, using the yaml alias + - matchers: ['{alerttype="stock"}'] + group_by: *stock_grouping ... ```