From 339029366d66fe1b285752cd21ee998a7a18fb96 Mon Sep 17 00:00:00 2001
From: Ashok Siyani
Date: Wed, 15 Jan 2025 13:33:26 +0000
Subject: [PATCH] fix yaml rule templates

---
 capacity-experiments/capacity-experiments.yaml.tmpl |  2 +-
 common/stock/terraform_sync.yaml.tmpl               |  2 +-
 common/terraform-applier.yaml.tmpl                  |  4 ++--
 common/thanos.yaml.tmpl                             | 12 ++++++------
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/capacity-experiments/capacity-experiments.yaml.tmpl b/capacity-experiments/capacity-experiments.yaml.tmpl
index 9dc12f6..75280d7 100644
--- a/capacity-experiments/capacity-experiments.yaml.tmpl
+++ b/capacity-experiments/capacity-experiments.yaml.tmpl
@@ -11,7 +11,7 @@ groups:
       team: infra
     annotations:
       summary: "AZ {{ $labels.zone}} is running out of memory for pods"
-      dashboard: 
+      dashboard:
   - alert: AvailabilityZoneRunningOutOfMemory99for10m
     expr: avg(node_memory_working_set_bytes/on(node)(kube_node_status_capacity{resource="memory"} - on (node) node_eviction_threshold) * on(node) group_left(zone) kube_node_labels{role="worker"}) by (zone) > 0.99
     for: 10m
diff --git a/common/stock/terraform_sync.yaml.tmpl b/common/stock/terraform_sync.yaml.tmpl
index f23cdbf..3718e05 100644
--- a/common/stock/terraform_sync.yaml.tmpl
+++ b/common/stock/terraform_sync.yaml.tmpl
@@ -22,6 +22,6 @@ groups:
         If module is using kube backend and the state is locked you can remove the lock
         with the following command:
         `kubectl --context={{ $labels.kubernetes_cluster }} -n {{ $labels.namespace }} patch lease lock-tfstate-default-{{ $labels.module }} --type=json -p='[{"op":"remove","path":"/spec/holderIdentity"}]'`
-      dashboard: 
+      dashboard:
       logs: 'https://grafana.$ENVIRONMENT.aws.uw.systems/explore?left=["now-1h","now","Loki",{"expr":"{kubernetes_cluster=\"{{$labels.kubernetes_cluster}}\",kubernetes_namespace=\"{{$labels.kubernetes_namespace}}\"} |=\"{{$labels.module}}\""}]'
       tf_applier: "https://terraform-applier-system.$ENVIRONMENT.$PROVIDER.uw.systems/"
diff --git a/common/terraform-applier.yaml.tmpl b/common/terraform-applier.yaml.tmpl
index 9d0156e..7df7f66 100644
--- a/common/terraform-applier.yaml.tmpl
+++ b/common/terraform-applier.yaml.tmpl
@@ -17,7 +17,7 @@ groups:
         Please also collect `goroutine` info before restart for debugging the issue.
         `https://terraform-applier-system.$ENVIRONMENT.$PROVIDER.uw.systems/debug/pprof/goroutine?debug=1`
       command: "`kubectl --context $ENVIRONMENT-$PROVIDER --namespace {{ $labels.kubernetes_namespace }} rollout restart sts {{ $labels.kubernetes_name }}`"
-      dashboard: 
+      dashboard:
       logs:
   - alert: TerraformApplierGitMirrorError
     expr: time() - max by (repo) (terraform_applier_git_last_mirror_timestamp{}) > 600
@@ -27,5 +27,5 @@ groups:
     annotations:
       summary: "terraform-applier has not been able to fetch {{ $labels.repo }} repository in the last 10m"
       impact: "terraform-applier will not be running modules from this repository"
-      dashboard: 
+      dashboard:
       logs:
diff --git a/common/thanos.yaml.tmpl b/common/thanos.yaml.tmpl
index 9996805..2307d60 100644
--- a/common/thanos.yaml.tmpl
+++ b/common/thanos.yaml.tmpl
@@ -24,7 +24,7 @@ groups:
       team: infra
     annotations:
       summary: "Thanos Rule {{$labels.kubernetes_namespace}}/{{$labels.kubernetes_pod_name}} is failing to queue alerts"
-      dashboard: 
+      dashboard:
       logs:
   - alert: ThanosRuleSenderIsFailingAlerts
     expr: sum by (kubernetes_cluster,kubernetes_namespace, kubernetes_pod_name) (rate(thanos_alert_sender_alerts_dropped_total{}[5m])) > 0
@@ -33,7 +33,7 @@ groups:
       team: infra
     annotations:
       summary: "Thanos Rule {{$labels.kubernetes_namespace}}/{{$labels.kubernetes_pod_name}} is failing to send alerts to alertmanager."
-      dashboard: 
+      dashboard:
       logs:
   - alert: ThanosNoRuleEvaluations
     expr: |
@@ -45,7 +45,7 @@ groups:
       team: infra
     annotations:
       summary: "Thanos Rule {{$labels.kubernetes_namespace}}/{{$labels.kubernetes_pod_name}} did not perform any rule evaluations in the past 10 minutes."
-      dashboard: 
+      dashboard:
       logs:
   - alert: ThanosRuleEvaluationLatencyHigh
     expr: |
@@ -60,7 +60,7 @@ groups:
     annotations:
       summary: "Thanos rule {{$labels.kubernetes_namespace}}/{{$labels.kubernetes_pod_name}} has higher evaluation latency than interval for more then 10 group rules"
       impact: "Slow evaluation can result in missed evaluations"
-      dashboard: 
+      dashboard:
       logs:
   - alert: ThanosRuleHighRuleEvaluationFailures
     expr: |
@@ -75,7 +75,7 @@ groups:
      team: infra
     annotations:
       summary: "Thanos Rule {{$labels.kubernetes_namespace}}/{{$labels.kubernetes_pod_name}} is failing to evaluate more then 10 group rules."
-      dashboard: 
+      dashboard:
       logs:
   - alert: ThanosRuleNoEvaluationFor10Intervals
     expr: |
@@ -89,7 +89,7 @@ groups:
       summary: Thanos Rule {{$labels.kubernetes_namespace}}/{{$labels.kubernetes_name}} has rule groups that did not evaluate for 10 intervals.
       description: The rule group {{$labels.rule_group}} did not evaluate for at least 10x of their expected interval.
       impact: "Alerts are not evaluated hence they wont be fired even if conditions are met"
-      dashboard: 
+      dashboard:
       logs:
   - alert: ThanosBucketOperationsFailing
     expr: |