From 7b00e1f4a7d51898f771031b313813ede9b0f99b Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 27 Oct 2023 15:57:51 +0200 Subject: [PATCH 01/20] Add uw_namespace_owner rule --- common/kustomization.yaml | 1 + common/stock/team_detection.yaml | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 common/stock/team_detection.yaml diff --git a/common/kustomization.yaml b/common/kustomization.yaml index b6d5be1..6a68911 100644 --- a/common/kustomization.yaml +++ b/common/kustomization.yaml @@ -13,6 +13,7 @@ configMapGenerator: - stock/namespace_sync.yaml.tmpl - stock/storage.yaml.tmpl - stock/terraform_sync.yaml.tmpl + - stock/team_detection.yaml - stock/vault-clients.yaml.tmpl name: alert-templates-common diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml new file mode 100644 index 0000000..17de9bf --- /dev/null +++ b/common/stock/team_detection.yaml @@ -0,0 +1,5 @@ +groups: + - name: team_detection + rules: + - record: uw_namespace_owner + expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)")) From dc4c205ad8ff9c0f7b65f4c38559b2a9c81bafb1 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Mon, 30 Oct 2023 10:41:29 +0100 Subject: [PATCH 02/20] Remove sum --- common/stock/team_detection.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 17de9bf..094016e 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -2,4 +2,5 @@ groups: - name: team_detection rules: - record: uw_namespace_owner - expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)")) + expr: label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)") + #expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", 
"label_uw_systems_owner", "(system|partner)")) From 47f422f0bb690dd5743d555fc44d84a369ba4423 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Wed, 15 Nov 2023 15:51:15 +0100 Subject: [PATCH 03/20] Add uw_namespace_team rule --- common/stock/team_detection.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 094016e..5b5b6be 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -2,5 +2,6 @@ groups: - name: team_detection rules: - record: uw_namespace_owner - expr: label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)") - #expr: sum by (namespace, team) (label_replace(kube_namespace_labels, "team", "$1", "label_uw_systems_owner", "(system|partner)")) + expr: label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner)") + - record: uw_namespace_team + expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system)")) From 459105400748730537288d5e497c17f3ec0c9f62 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 16 Nov 2023 12:22:27 +0100 Subject: [PATCH 04/20] Update alerts to test changes --- common/stock/namespace_sync.yaml.tmpl | 9 ++++++--- common/stock/team_detection.yaml | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index aaeef07..f02f2ae 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -6,19 +6,22 @@ groups: rules: - alert: ArgoCDApplicationNotSynced expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 + #expr: argocd_app_info * on (namespace) group_left(team) sum by (namespace, team) (label_replace(kube_namespace_labels{job!="opencost"}, "team", "$1", 
"label_uw_systems_owner", "(system|partner)")) for: 1h labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} is not synced for last 1h" impact: "Some manifests won't be automatically deployed." action: "Check the web UI / logs for errors." link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled - expr: argocd_app_info{autosync_enabled="false"} == 1 + expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team for: 1h labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Auto Sync on Argo CD application {{$labels.name}} is disabled" impact: "New manifests won't be automatically deployed." diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 5b5b6be..d4e9f29 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -3,5 +3,6 @@ groups: rules: - record: uw_namespace_owner expr: label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner)") + # used as: `metric_with_namespace * on (namespace) group_left (team) uw_namespace_team - record: uw_namespace_team expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system)")) From 1d4a6fbdecaeb4e9ce4cefe4a781f81c1b55bcfe Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 16 Nov 2023 12:47:35 +0100 Subject: [PATCH 05/20] Patch for testing --- common/stock/namespace_sync.yaml.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index f02f2ae..d1e7f49 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ 
-18,7 +18,7 @@ groups: link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team - for: 1h + #for: 1h labels: alerttype: stock alertgroup: namespace_sync From 6693bb96a8ec64c48fa91190f8883c6c89419fdc Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 17 Nov 2023 11:39:17 +0100 Subject: [PATCH 06/20] Remove testing patch --- common/stock/namespace_sync.yaml.tmpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index d1e7f49..f02f2ae 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -18,7 +18,7 @@ groups: link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team - #for: 1h + for: 1h labels: alerttype: stock alertgroup: namespace_sync From a606b3205c2f6cc7c72a06db115e7fdc79d13460 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 17 Nov 2023 11:42:07 +0100 Subject: [PATCH 07/20] Remove unnecessary recording rule --- common/stock/team_detection.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index d4e9f29..544fd21 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -1,8 +1,10 @@ +# PROMETHEUS RULES +# DO NOT REMOVE line above, used in `pre-commit` hook + groups: - name: team_detection rules: - - record: uw_namespace_owner - expr: label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner)") - # used as: 
`metric_with_namespace * on (namespace) group_left (team) uw_namespace_team + # used as: ` * on (namespace) group_left + # (team) uw_namespace_team` - record: uw_namespace_team - expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system)")) + expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) From b69488544823c8ca864ff952784150e257e79c5d Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 17 Nov 2023 11:44:52 +0100 Subject: [PATCH 08/20] Remove partial changes --- common/stock/namespace_sync.yaml.tmpl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index f02f2ae..2d484d2 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -6,11 +6,9 @@ groups: rules: - alert: ArgoCDApplicationNotSynced expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 - #expr: argocd_app_info * on (namespace) group_left(team) sum by (namespace, team) (label_replace(kube_namespace_labels{job!="opencost"}, "team", "$1", "label_uw_systems_owner", "(system|partner)")) for: 1h labels: - alerttype: stock - alertgroup: namespace_sync + group: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} is not synced for last 1h" impact: "Some manifests won't be automatically deployed." 
From f1539fce15ce42544ec2723122dd923af47488cc Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Tue, 21 Nov 2023 17:02:42 +0100 Subject: [PATCH 09/20] Update readme --- common/stock/README.md | 77 ++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/common/stock/README.md b/common/stock/README.md index 7749f9d..4c92e4c 100644 --- a/common/stock/README.md +++ b/common/stock/README.md @@ -6,51 +6,78 @@ to consume. Stock alerts are already setup and "firing" for all teams, and they only need to be consumed by teams, if they choose to. -To consume the stock alerts, add a new route on alertmanager filtering your -namespaces and pointing to your receiver. The `matchers` clause follows the -usual prometheus syntax. +To consume the alerts, there are 2 options: +* Opt into automatic team detection based on namespace owner (recommended) +* Create a new alertmanager route to manually consume alerts for whatever + namespace you care about -Example (with recommended grouping): +### Automatic team detection based on `uw.systems/owner` namespace label +Stock alerts can generate a `team` label with the value of the +`uw.systems/owner` label of the namespace that the alert belongs to. + +To opt-in into this team detection, add your `uw.systems/owner` value to the +regex at the end of +https://github.com/utilitywarehouse/system-alerts/blob/main/common/stock/team_detection.yaml#L10. +Ask in #infra if you need help setting this up. + +Now the stock alerts for your namespaces will have a `team` label and be +captured by your existing router. + +### Manual configuration of specific namespaces +Add a new route on alertmanager filtering your namespaces and pointing to your +receiver. The `matchers` clause follows the usual prometheus syntax. + +Example: ``` route: + ... routes: - - matchers: ['{namespace=~"myteam-.*"}'] + ... 
+ - matchers: ['{alerttype="stock", namespace=~"myteam-.*|alsoimportant"}'] receiver: myteam-receiver - group_by: ["alertname", "namespace", "deployment", "statefulset"] + ... ``` -If you want to opt out of some alerts, you can have a subroute matching what -you don't want and sending it to the `deadletter` receiver. +### Note on grouping alerts +Stock alerts use the default grouping found at the top of the [alertmanager +config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12) -Example ignoring some alerts: +If your team alerts need different grouping, you can configure it by adding a +subroute for your alerts with your custom grouping: ``` route: + ... routes: - - matchers: ['{namespace=~"myteam-.*"}'] + ... + - matchers: ['{team="myteam"}'] receiver: myteam-receiver - group_by: ["alertname", "namespace", "deployment", "statefulset"] routes: - # Example of ignoring some alerts by sending them to `deadletter` - - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] - receiver: deadletter + # Example of custom grouping for non-stock alerts + - matchers: ['{alerttype!="stock"}'] + group_by: ["your", "custom", "grouping"] + ... ``` -## Note on `team` label and catching-non-stock alerts -Matchers filtering only by namespace can also match team's own alerts, which -could be undesired. If you need different configuration for stock alerts and -your team dedicated alerts, you need to tweak the filters. +### Opting out of some alerts +If you want to opt out of some alerts, you can have a subroute matching what +you don't want and sending it to the `deadletter` receiver. -Example of different routes for stock and team alerts(assuming team alerts use -`team` label): +Example ignoring some alerts: ``` route: + ... routes: - - matchers: ['{namespace=~"myteam-.*", team=""}'] - receiver: myteam-receiver-for-stock-alerts - ... + ... 
- matchers: ['{team="myteam"}'] - receiver: myteam-receiver-for-team-alerts - ... + receiver: myteam-receiver + routes: + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertgroup="storage"}'] + receiver: deadletter + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] + receiver: deadletter + ... ``` ## Notes for @system From ac4ca8c0aca6d9fc8a813ecb2f05dfa41e7b36df Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 11:44:43 +0100 Subject: [PATCH 10/20] Add uw_namespace_oncall_team rule --- common/stock/team_detection.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 544fd21..4c90cec 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -8,3 +8,6 @@ groups: # (team) uw_namespace_team` - record: uw_namespace_team expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) + # used as: ` * on (namespace) group_left (team) uw_namespace_oncall_team` + - record: uw_namespace_oncall_team + expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) From 186649e96af4fdc07020487846de88eae3a64992 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 11:52:26 +0100 Subject: [PATCH 11/20] Update alert labels --- common/stock/container.yaml.tmpl | 9 ++++++--- common/stock/missing_replicas.yaml.tmpl | 18 ++++++++++++------ common/stock/namespace_sync.yaml.tmpl | 9 ++++++--- common/stock/storage.yaml.tmpl | 6 ++++-- common/stock/terraform_sync.yaml.tmpl | 3 ++- common/stock/vault-clients.yaml.tmpl | 6 ++++-- 6 files changed, 34 insertions(+), 17 deletions(-) diff --git a/common/stock/container.yaml.tmpl 
b/common/stock/container.yaml.tmpl index 5ecc1a3..eac61b7 100644 --- a/common/stock/container.yaml.tmpl +++ b/common/stock/container.yaml.tmpl @@ -9,7 +9,8 @@ groups: - alert: ContainerRestartingOften expr: increase(kube_pod_container_status_restarts_total[2h]) > 3 labels: - group: container + alerttype: stock + alertgroup: container annotations: summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has restarted more than 3 times in the last 2h" impact: "Container may be crashlooping and not working as expected" @@ -21,7 +22,8 @@ groups: expr: sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0.95 for: 15m labels: - group: container + alerttype: stock + alertgroup: container annotations: summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} is being CPU throttled." impact: "Container might take longer than normal to respond to requests." 
@@ -31,7 +33,8 @@ groups: expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) kube_pod_container_status_ready == 0 for: 5m labels: - group: container + alerttype: stock + alertgroup: container annotations: summary: "Container {{$labels.namespace}}/{{$labels.pod}}/{{$labels.container}} has been OOMKilled recently and it's not ready" impact: "Container not ready, may affect service uptime" diff --git a/common/stock/missing_replicas.yaml.tmpl b/common/stock/missing_replicas.yaml.tmpl index 7b37bc0..126979f 100644 --- a/common/stock/missing_replicas.yaml.tmpl +++ b/common/stock/missing_replicas.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} for: 15m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Deployment {{$labels.namespace}}/{{$labels.deployment}} has missing replicas for 15m" impact: "Workload may be unavailable or have lost high availability" @@ -18,7 +19,8 @@ groups: expr: (kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} for: 15m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Statefulset {{$labels.namespace}}/{{$labels.statefulset}} has missing replicas for 15m" impact: "Workload may be unavailable or have lost high availability" @@ -29,7 +31,8 @@ groups: expr: (kube_daemonset_status_number_ready != kube_daemonset_status_desired_number_scheduled) and changes(kube_daemonset_status_updated_number_scheduled[10m]) == 0 for: 5m labels: - group: missing_replicas + alerttype: 
stock + alertgroup: missing_replicas annotations: summary: "Daemonset {{$labels.namespace}}/{{$labels.daemonset}} has missing replicas" impact: "Workload unavailable on some nodes" @@ -39,7 +42,8 @@ groups: expr: (kube_deployment_status_replicas_available == 0 and kube_deployment_status_replicas != 0) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} for: 5m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Deployment {{$labels.namespace}}/{{$labels.deployment}} has 0 healthy replicas." impact: "Workload is down" @@ -49,7 +53,8 @@ groups: expr: (kube_statefulset_status_replicas_ready == 0 and kube_statefulset_status_replicas != 0) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} for: 5m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Statefulset {{$labels.namespace}}/{{$labels.statefulset}} has 0 healthy replicas." impact: "Workload is down" @@ -59,7 +64,8 @@ groups: expr: (kube_daemonset_status_number_ready == 0 and kube_daemonset_status_desired_number_scheduled != 0) for: 5m labels: - group: missing_replicas + alerttype: stock + alertgroup: missing_replicas annotations: summary: "Daemonset {{$labels.namespace}}/{{$labels.daemonset}} has 0 healthy replicas." 
impact: "Workload is down" diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index 2d484d2..7fce46b 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 for: 1h labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} is not synced for last 1h" impact: "Some manifests won't be automatically deployed." @@ -33,7 +34,8 @@ groups: - alert: ArgoCDApplicationSyncFailure expr: increase(argocd_app_sync_total{phase=~"Error|Failed"}[1h]) > 0 labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "Argo CD application {{$labels.name}} Sync failed" impact: "Some manifests won't be automatically deployed." @@ -43,7 +45,8 @@ groups: expr: kube_applier_last_run_success != 1 for: 1h10m labels: - group: namespace_sync + alerttype: stock + alertgroup: namespace_sync annotations: summary: "kube-applier encountered errors while applying {{ $labels.namespace }}" impact: Some manifest won't be automatically deployed. 
diff --git a/common/stock/storage.yaml.tmpl b/common/stock/storage.yaml.tmpl index 656da7b..6371b94 100644 --- a/common/stock/storage.yaml.tmpl +++ b/common/stock/storage.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 72 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 for: 2h labels: - group: storage + alerttype: stock + alertgroup: storage annotations: summary: "Volume {{$labels.namespace}}/{{$labels.persistentvolumeclaim}} will fill up in 72h" impact: "Exhausting available disk space will most likely result in service disruption" @@ -18,7 +19,8 @@ groups: expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 6 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 for: 30m labels: - group: storage + alerttype: stock + alertgroup: storage annotations: summary: "Volume {{$labels.namespace}}/{{$labels.persistentvolumeclaim}} will fill up in 6h" impact: "Exhausting available disk space will most likely result in service disruption" diff --git a/common/stock/terraform_sync.yaml.tmpl b/common/stock/terraform_sync.yaml.tmpl index b26dd4c..4b04fb1 100644 --- a/common/stock/terraform_sync.yaml.tmpl +++ b/common/stock/terraform_sync.yaml.tmpl @@ -8,7 +8,8 @@ groups: expr: terraform_applier_module_last_run_success == 0 for: 1h10m labels: - group: terraform_sync + alerttype: stock + alertgroup: terraform_sync annotations: summary: "Terraform module {{ $labels.module }} in {{ $labels.namespace }} encountered errors during last terraform run" description: | diff --git a/common/stock/vault-clients.yaml.tmpl b/common/stock/vault-clients.yaml.tmpl index 9ae29f8..ad8d1f2 100644 --- a/common/stock/vault-clients.yaml.tmpl +++ b/common/stock/vault-clients.yaml.tmpl @@ -9,7 +9,8 @@ groups: expr: time() - vkcc_sidecar_expiry_timestamp_seconds > 0 for: 10m labels: - group: vault_clients + alerttype: stock + alertgroup: vault_clients annotations: 
description: | The credentials served by the vault credentials agent sidecar have expired and have not @@ -20,7 +21,8 @@ groups: expr: (kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) kube_pod_status_scheduled{condition="true"} == 1) unless on (pod,namespace) kube_pod_container_info{container=~"vault-credentials-agent.*"} for: 10m labels: - group: vault_clients + alerttype: stock + alertgroup: vault_clients annotations: description: | The pod is annotated with `{{ $labels.key }}={{ $labels.value }}` but does not have a From 2f79d6e0d220f67a410b412eda2d196fd19ee460 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 12:25:52 +0100 Subject: [PATCH 12/20] Update container alerts with team detection --- common/stock/container.yaml.tmpl | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/common/stock/container.yaml.tmpl b/common/stock/container.yaml.tmpl index eac61b7..5d355be 100644 --- a/common/stock/container.yaml.tmpl +++ b/common/stock/container.yaml.tmpl @@ -7,7 +7,16 @@ groups: # Set period to 2h to capture slow crashing containers like # thanos-compact that take a long time to start up - alert: ContainerRestartingOften +<<<<<<< HEAD expr: increase(kube_pod_container_status_restarts_total[2h]) > 3 +======= + expr: | + ( + increase(kube_pod_container_status_restarts_total[10m]) > 3 + ) + * on (namespace) group_left(team) uw_namespace_oncall_team + keep_firing_for: 10m +>>>>>>> f37f257 (Update container alerts with team detection) labels: alerttype: stock alertgroup: container @@ -19,7 +28,20 @@ groups: logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%{{ 
$labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D" # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 - alert: ContainerCpuThrottled +<<<<<<< HEAD expr: sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0.95 +======= + # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 + expr: | + ( + ( + sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) + ) > 0.95 + ) + * on (namespace) group_left(team) uw_namespace_oncall_team +>>>>>>> f37f257 (Update container alerts with team detection) for: 15m labels: alerttype: stock @@ -30,7 +52,13 @@ groups: action: "Investigate CPU consumption and adjust pods resources if needed." 
dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/VAE0wIcik/kubernetes-pod-resources?orgId=1&refresh=1m&from=now-12h&to=now&var-instance=All&var-namespace={{ $labels.namespace }}" - alert: ContainerOOMing - expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) kube_pod_container_status_ready == 0 + expr: | + ( + kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} + and on (container,pod) + (kube_pod_container_status_ready == 0) + ) + * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock From 49947865eb5f3e9bd7a9a81ccad5459b24ee8ba4 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Thu, 23 Nov 2023 14:46:21 +0100 Subject: [PATCH 13/20] Fix uw_namespace_oncall_team --- common/stock/team_detection.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 4c90cec..4a78594 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -10,4 +10,6 @@ groups: expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) # used as: ` * on (namespace) group_left (team) uw_namespace_oncall_team` - record: uw_namespace_oncall_team - expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) + expr: sum by (namespace, team) (label_replace(kube_namespace_annotations{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) + #- record: uw_namespace_oncall_team + # expr: sum by (namespace, team) (label_replace(kube_namespace_annotations, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) From 18bb084b1bf09a9111135ded07745ce37fe57f58 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:04:19 +0100 Subject: 
[PATCH 14/20] Update readme --- common/stock/README.md | 92 +++++++++--------------------------------- 1 file changed, 19 insertions(+), 73 deletions(-) diff --git a/common/stock/README.md b/common/stock/README.md index 4c92e4c..695b4bd 100644 --- a/common/stock/README.md +++ b/common/stock/README.md @@ -3,42 +3,38 @@ Common and useful alerts maintained by @system and readily available for teams to consume. ## Usage -Stock alerts are already setup and "firing" for all teams, and they only need -to be consumed by teams, if they choose to. +Stock alerts are already setup and "firing" for all namespaces. Teams only need +to claim namespace oncall responsibility to receive them. -To consume the alerts, there are 2 options: -* Opt into automatic team detection based on namespace owner (recommended) -* Create a new alertmanager route to manually consume alerts for whatever - namespace you care about +To do so, add a `uw.systems/oncall-team` annotation to namespaces to claim +them, and the team will receive the stock alerts for those namespaces +automatically. -### Automatic team detection based on `uw.systems/owner` namespace label -Stock alerts can generate a `team` label with the value of the -`uw.systems/owner` label of the namespace that the alert belongs to. +Namespace definitions live in `kubernetes-manifests//kube-system/namespaces.yaml` -To opt-in into this team detection, add your `uw.systems/owner` value to the -regex at the end of -https://github.com/utilitywarehouse/system-alerts/blob/main/common/stock/team_detection.yaml#L10. -Ask in #infra if you need help setting this up. - -Now the stock alerts for your namespaces will have a `team` label and be -captured by your existing router. - -### Manual configuration of specific namespaces -Add a new route on alertmanager filtering your namespaces and pointing to your -receiver. The `matchers` clause follows the usual prometheus syntax. 
+### Opting out of some alerts +If you want to opt out of some alerts, you can have a subroute matching what +you don't want and sending it to the `deadletter` receiver. -Example: +Example ignoring some alerts: ``` route: ... routes: ... - - matchers: ['{alerttype="stock", namespace=~"myteam-.*|alsoimportant"}'] + - matchers: ['{team="myteam"}'] receiver: myteam-receiver + routes: + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertgroup="storage"}'] + receiver: deadletter + # Example of ignoring some alerts by sending them to `deadletter` + - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] + receiver: deadletter ... ``` -### Note on grouping alerts +### Note on alert grouping Stock alerts use the default grouping found at the top of the [alertmanager config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12) @@ -57,53 +53,3 @@ route: group_by: ["your", "custom", "grouping"] ... ``` - -### Opting out of some alerts -If you want to opt out of some alerts, you can have a subroute matching what -you don't want and sending it to the `deadletter` receiver. - -Example ignoring some alerts: -``` -route: - ... - routes: - ... - - matchers: ['{team="myteam"}'] - receiver: myteam-receiver - routes: - # Example of ignoring some alerts by sending them to `deadletter` - - matchers: ['{alertgroup="storage"}'] - receiver: deadletter - # Example of ignoring some alerts by sending them to `deadletter` - - matchers: ['{alertname="StatefulSetMissingReplicas",statefulset="kafka"}'] - receiver: deadletter - ... -``` - -## Notes for @system - -### Note on `namespace` vs `kubernetes_namespace` labels -There are two possible "namespace" labels in metrics -* `namespace` is the namespace labeled by a metric inside it's exporter. 
It is - relevant in metrics exposed by workloads that are aware of kubernetes - namespace as a concept, like metrics coming from argocd or - kube-state-metrics. -* `kubernetes_namespace` is the namespace where the metric was scraped from. - This is relevant in metrics exposed by workloads that do not deal with - kubernetes namespaces as a concept. - -Currently all the metrics used in the stock alerts rely only on `namespace` label, but this may change in the future. We could either add a second matcher or relabel `kubernetes_namespace` to `namespace` in all metrics where `namespace` is not set. - -### Note for @system team configuration -There are metrics that have a @system `kubernetes_namespace` but a non-@system -`namespace`, like an argocd metric that comes from a @system namespace but is -talking about another team's namespace (`argocd_app_info{sync_status!="Synced", -kubernetes_namespace="sys-argo-cd", namespace="billing"} 1`). - -However, there are no metrics that have a @system `namespace` but a non-@system -`kubernetes_namespace`. This can be verified by running `group by (__name__) -({namespace=~"kube-system|sys-.*", -kubernetes_namespace!~"|kube-system|sys-.*"})` and getting no results. - -For this reasons, @system needs to be more careful with it's matchers and ensure -it's not catching team's metrics by accident. 
From 09f846f4424ac3c6729e0d7e99b4cef0df760d74 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:06:49 +0100 Subject: [PATCH 15/20] Update container alerts --- common/stock/container.yaml.tmpl | 34 +++----------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/common/stock/container.yaml.tmpl b/common/stock/container.yaml.tmpl index 5d355be..e616458 100644 --- a/common/stock/container.yaml.tmpl +++ b/common/stock/container.yaml.tmpl @@ -7,16 +7,7 @@ groups: # Set period to 2h to capture slow crashing containers like # thanos-compact that take a long time to start up - alert: ContainerRestartingOften -<<<<<<< HEAD - expr: increase(kube_pod_container_status_restarts_total[2h]) > 3 -======= - expr: | - ( - increase(kube_pod_container_status_restarts_total[10m]) > 3 - ) - * on (namespace) group_left(team) uw_namespace_oncall_team - keep_firing_for: 10m ->>>>>>> f37f257 (Update container alerts with team detection) + expr: (increase(kube_pod_container_status_restarts_total[2h]) > 3) * on (namespace) group_left(team) uw_namespace_oncall_team labels: alerttype: stock alertgroup: container @@ -28,20 +19,7 @@ groups: logs: "https://grafana.$ENVIRONMENT.aws.uw.systems/explore?orgId=1&left=%7B%22datasource%22%3A%22P8E80F9AEF21F6940%22%2C%22queries%22%3A%5B%7B%22refId%22%3A%22A%22%2C%22expr%22%3A%22%7Bkubernetes_cluster%3D%5C%22$ENVIRONMENT-$PROVIDER%5C%22%2C+kubernetes_namespace%3D%5C%22{{ $labels.namespace }}%5C%22%2C+app_kubernetes_io_name%3D%5C%{{ $labels.label_app_kubernetes_io_name }}%5C%22%7D%22%2C%22queryType%22%3A%22range%22%2C%22datasource%22%3A%7B%22type%22%3A%22loki%22%2C%22uid%22%3A%22P8E80F9AEF21F6940%22%7D%2C%22editorMode%22%3A%22code%22%7D%5D%2C%22range%22%3A%7B%22from%22%3A%22now-1h%22%2C%22to%22%3A%22now%22%7D%7D" # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 - alert: ContainerCpuThrottled -<<<<<<< HEAD - expr: 
sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0.95 -======= - # https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/108#issuecomment-432796867 - expr: | - ( - ( - sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) - ) > 0.95 - ) - * on (namespace) group_left(team) uw_namespace_oncall_team ->>>>>>> f37f257 (Update container alerts with team detection) + expr: ((sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)) > 0.95) * on (namespace) group_left(team) uw_namespace_oncall_team for: 15m labels: alerttype: stock @@ -52,13 +30,7 @@ groups: action: "Investigate CPU consumption and adjust pods resources if needed." 
dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/VAE0wIcik/kubernetes-pod-resources?orgId=1&refresh=1m&from=now-12h&to=now&var-instance=All&var-namespace={{ $labels.namespace }}" - alert: ContainerOOMing - expr: | - ( - kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} - and on (container,pod) - (kube_pod_container_status_ready == 0) - ) - * on (namespace) group_left(team) uw_namespace_oncall_team + expr: (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} and on (container,pod) (kube_pod_container_status_ready == 0)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock From 4d70942a490c31aa96adf98c6589b3f8b11a0b3b Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:20:49 +0100 Subject: [PATCH 16/20] Update missing_replicas alerts --- common/stock/missing_replicas.yaml.tmpl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/stock/missing_replicas.yaml.tmpl b/common/stock/missing_replicas.yaml.tmpl index 126979f..a1bf45b 100644 --- a/common/stock/missing_replicas.yaml.tmpl +++ b/common/stock/missing_replicas.yaml.tmpl @@ -5,7 +5,7 @@ groups: - name: MissingReplicas rules: - alert: DeploymentMissingReplicas - expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} + expr: (kube_deployment_spec_replicas != kube_deployment_status_replicas_available) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 15m labels: alerttype: stock @@ -16,7 +16,7 @@ groups: action: "Check why some replicas are not healthy" command: "kubectl --context $ENVIRONMENT-$PROVIDER 
--namespace {{ $labels.namespace }} describe deployment {{ $labels.deployment }}" - alert: StatefulsetMissingReplicas - expr: (kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} + expr: (kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 15m labels: alerttype: stock @@ -28,7 +28,7 @@ groups: command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe statefulset {{ $labels.statefulset }}" - alert: DaemonsetMissingReplicas # Alert if there are unhealthy replicas and the ds is not updating it's replicas - expr: (kube_daemonset_status_number_ready != kube_daemonset_status_desired_number_scheduled) and changes(kube_daemonset_status_updated_number_scheduled[10m]) == 0 + expr: ((kube_daemonset_status_number_ready != kube_daemonset_status_desired_number_scheduled) and (changes(kube_daemonset_status_updated_number_scheduled[10m]) == 0)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock @@ -39,7 +39,7 @@ groups: action: "Check why some replicas are not healthy" command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe daemonset {{ $labels.daemonset }}" - alert: DeploymentMissingAllReplicas - expr: (kube_deployment_status_replicas_available == 0 and kube_deployment_status_replicas != 0) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations{} + expr: (kube_deployment_status_replicas_available == 0 and 
kube_deployment_status_replicas != 0) * ON (deployment, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_deployment_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock @@ -50,7 +50,7 @@ groups: action: "Check why all replicas are missing" command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe deployment {{ $labels.deployment }}" - alert: StatefulsetMissingAllReplicas - expr: (kube_statefulset_status_replicas_ready == 0 and kube_statefulset_status_replicas != 0) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations{} + expr: (kube_statefulset_status_replicas_ready == 0 and kube_statefulset_status_replicas != 0) * ON (statefulset, namespace) group_left(annotation_app_uw_systems_tier, annotation_app_uw_systems_system, annotation_app_uw_systems_owner) kube_statefulset_annotations * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock @@ -61,7 +61,7 @@ groups: action: "Check why all replicas are missing" command: "kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.namespace }} describe statefulset {{ $labels.statefulset }}" - alert: DaemonsetMissingAllReplicas - expr: (kube_daemonset_status_number_ready == 0 and kube_daemonset_status_desired_number_scheduled != 0) + expr: (kube_daemonset_status_number_ready == 0 and kube_daemonset_status_desired_number_scheduled != 0) * on (namespace) group_left(team) uw_namespace_oncall_team for: 5m labels: alerttype: stock From 2bf0e765d5e3ac871f896042570bdf7d4ae35d39 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:26:33 +0100 Subject: [PATCH 17/20] Update more alerts --- common/stock/namespace_sync.yaml.tmpl | 8 ++++---- common/stock/storage.yaml.tmpl | 4 ++-- 2 files changed, 6 
insertions(+), 6 deletions(-) diff --git a/common/stock/namespace_sync.yaml.tmpl b/common/stock/namespace_sync.yaml.tmpl index 7fce46b..7f37525 100644 --- a/common/stock/namespace_sync.yaml.tmpl +++ b/common/stock/namespace_sync.yaml.tmpl @@ -5,7 +5,7 @@ groups: - name: NamespaceSync rules: - alert: ArgoCDApplicationNotSynced - expr: argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1 + expr: (argocd_app_info{autosync_enabled="true",sync_status!="Synced"} == 1) * on (namespace) group_left(team) uw_namespace_oncall_team for: 1h labels: alerttype: stock @@ -16,7 +16,7 @@ groups: action: "Check the web UI / logs for errors." link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationAutoSyncDisabled - expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_team + expr: (argocd_app_info{autosync_enabled="false"} == 1) * on (namespace) group_left(team) uw_namespace_oncall_team for: 1h labels: alerttype: stock @@ -32,7 +32,7 @@ groups: `kubectl --context {{$labels.kubernetes_cluster}} -n {{$labels.namespace}} patch --type='merge' applications.argoproj.io {{$labels.name}} -p "{\"spec\":{\"syncPolicy\":{\"automated\":{\"prune\":false,\"selfHeal\":false}}}}"` link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: ArgoCDApplicationSyncFailure - expr: increase(argocd_app_sync_total{phase=~"Error|Failed"}[1h]) > 0 + expr: (increase(argocd_app_sync_total{phase=~"Error|Failed"}[1h]) > 0) * on (namespace) group_left(team) uw_namespace_oncall_team labels: alerttype: stock alertgroup: namespace_sync @@ -42,7 +42,7 @@ groups: action: "Check the web UI / logs for errors." 
link: https://argocd-system.$ENVIRONMENT.$PROVIDER.uw.systems/applications/{{$labels.namespace}}/{{$labels.name}} - alert: KubeApplierErrors - expr: kube_applier_last_run_success != 1 + expr: (kube_applier_last_run_success != 1) * on (namespace) group_left(team) uw_namespace_oncall_team for: 1h10m labels: alerttype: stock diff --git a/common/stock/storage.yaml.tmpl b/common/stock/storage.yaml.tmpl index 6371b94..f0bcb7f 100644 --- a/common/stock/storage.yaml.tmpl +++ b/common/stock/storage.yaml.tmpl @@ -5,7 +5,7 @@ groups: - name: Storage rules: - alert: VolumeFillingUpin72h - expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 72 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 + expr: ((predict_linear(kubelet_volume_stats_available_bytes[1h], 72 * 3600) < 0) and (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 2h labels: alerttype: stock @@ -16,7 +16,7 @@ groups: action: "Investigate disk usage and adjust volume size if necessary." 
dashboard: "https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/919b92a8e8041bd567af9edab12c840c/kubernetes-persistent-volumes?orgId=1&refresh=10s&var-datasource=default&var-cluster=${ENVIRONMENT}-${PROVIDER}&var-namespace={{ $labels.namespace }}&var-volume={{ $labels.persistentvolumeclaim }}" - alert: VolumeFillingUpin6h - expr: predict_linear(kubelet_volume_stats_available_bytes[1h], 6 * 3600) < 0 and kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5 + expr: ((predict_linear(kubelet_volume_stats_available_bytes[1h], 6 * 3600) < 0) and (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.5)) * on (namespace) group_left(team) uw_namespace_oncall_team for: 30m labels: alerttype: stock From 0f402d08950598bc61fb34e8133a36d33157a971 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:26:45 +0100 Subject: [PATCH 18/20] Drop unnecessary rule --- common/stock/team_detection.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/common/stock/team_detection.yaml b/common/stock/team_detection.yaml index 4a78594..e17db67 100644 --- a/common/stock/team_detection.yaml +++ b/common/stock/team_detection.yaml @@ -4,12 +4,7 @@ groups: - name: team_detection rules: - # used as: ` * on (namespace) group_left - # (team) uw_namespace_team` - - record: uw_namespace_team - expr: sum by (namespace, team) (label_replace(kube_namespace_labels{job="kube-state-metrics"}, "team", "$1", "label_uw_systems_owner", "(system|partner|telco)")) # used as: ` * on (namespace) group_left (team) uw_namespace_oncall_team` - record: uw_namespace_oncall_team + # job filtering is needed to avoid duplicated `kube_namespace_annotations` from `opencost` job expr: sum by (namespace, team) (label_replace(kube_namespace_annotations{job="kube-state-metrics"}, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) - #- record: uw_namespace_oncall_team - # expr: sum by (namespace, team) 
(label_replace(kube_namespace_annotations, "team", "$1", "annotation_uw_systems_oncall_team", "(.*)")) From 15e8987bc0f20bbfbc9357d78e957ea4aba90a15 Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 11:28:52 +0100 Subject: [PATCH 19/20] Update more alerts --- common/stock/terraform_sync.yaml.tmpl | 3 ++- common/stock/vault-clients.yaml.tmpl | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/common/stock/terraform_sync.yaml.tmpl b/common/stock/terraform_sync.yaml.tmpl index 4b04fb1..307da13 100644 --- a/common/stock/terraform_sync.yaml.tmpl +++ b/common/stock/terraform_sync.yaml.tmpl @@ -5,7 +5,8 @@ groups: - name: TerraformSync rules: - alert: TerraformApplierErrors - expr: terraform_applier_module_last_run_success == 0 + expr: (terraform_applier_module_last_run_success == 0) * on (namespace) group_left(team) uw_namespace_oncall_team + for: 1h10m labels: alerttype: stock diff --git a/common/stock/vault-clients.yaml.tmpl b/common/stock/vault-clients.yaml.tmpl index ad8d1f2..cf69421 100644 --- a/common/stock/vault-clients.yaml.tmpl +++ b/common/stock/vault-clients.yaml.tmpl @@ -6,7 +6,7 @@ groups: # Recommendations from https://s3-us-west-2.amazonaws.com/hashicorp-education/whitepapers/Vault/Vault-Consul-Monitoring-Guide.pdf rules: - alert: VaultSidecarCredentialsExpired - expr: time() - vkcc_sidecar_expiry_timestamp_seconds > 0 + expr: (time() - vkcc_sidecar_expiry_timestamp_seconds > 0) * on (namespace) group_left(team) uw_namespace_oncall_team for: 10m labels: alerttype: stock @@ -18,7 +18,7 @@ groups: summary: "The credentials for '{{ $labels.kubernetes_pod_name }}' have expired" dashboard: https://grafana.$ENVIRONMENT.$PROVIDER.uw.systems/d/U61wpstMk/vault-credentials-sidecars - alert: VaultSidecarMissing - expr: (kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) kube_pod_status_scheduled{condition="true"} == 1) unless on (pod,namespace) 
kube_pod_container_info{container=~"vault-credentials-agent.*"} + expr: ((kube_pod_annotations{annotation_injector_tumblr_com_request=~"vault-sidecar-.+"} and on (pod,namespace) (kube_pod_status_scheduled{condition="true"} == 1)) unless on (pod,namespace) kube_pod_container_info{container=~"vault-credentials-agent.*"}) * on (namespace) group_left(team) uw_namespace_oncall_team for: 10m labels: alerttype: stock From 710be6350a5bf1008b0e3646180babf4a936170d Mon Sep 17 00:00:00 2001 From: Hector Huertas Date: Fri, 24 Nov 2023 12:04:05 +0100 Subject: [PATCH 20/20] Update docs about alert grouping --- common/stock/README.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/common/stock/README.md b/common/stock/README.md index 695b4bd..02b330a 100644 --- a/common/stock/README.md +++ b/common/stock/README.md @@ -35,11 +35,9 @@ route: ``` ### Note on alert grouping -Stock alerts use the default grouping found at the top of the [alertmanager -config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12) - -If your team alerts need different grouping, you can configure it by adding a -subroute for your alerts with your custom grouping: +If your team is using a custom grouping that is missing entries from the +default grouping (set at the top of the [alertmanager +config](https://github.com/utilitywarehouse/kubernetes-manifests/blob/master/prod-aws/sys-mon/resources/alertmanager-config-template.yaml#L12)), it is suggested to configure stock alerts to use the stock grouping: ``` route: ... @@ -48,8 +46,8 @@ route: - matchers: ['{team="myteam"}'] receiver: myteam-receiver routes: - # Example of custom grouping for non-stock alerts - - matchers: ['{alerttype!="stock"}'] - group_by: ["your", "custom", "grouping"] + # Example of specifying grouping for stock alerts, using the yaml alias + - matchers: ['{alerttype="stock"}'] + group_by: *stock_grouping ... ```