cloud_run_alerts: add flexible json-based log indicators and policies

andriipetruk · andriipetruk · commit 6fa61afa41a0 · 2025-03-31T15:56:05.000-07:00
diff --git a/modules/alerts_cloud_run/README.md b/modules/alerts_cloud_run/README.md
@@ -43,15 +43,39 @@ module "cloud_run_service_alerts" {
   }
 
-  enable_log_based_text_indicators = true
+  enable_log_based_json_indicators = true
-  log_based_text_indicators = {
-    "scaling-failure" = {
-      log_name_suffix      = "requests"
-      severity             = "ERROR"
-      text_payload_message = "The request was aborted because there was no available instance."
+  log_based_json_indicators = {
+    "email-bounce-failure" = {
+      log_name_suffix    = "stdout"
+      severity           = "ERROR"
+      additional_filters = "jsonPayload.message=\"bounce error\" AND jsonPayload.method=\"POST\""
+
       condition_threshold = {
-        window    = 10 * local.minute
+        window    = 600  # 10 minutes
+        threshold = 0
+      }
+    }
+
+    "payment-processing-error" = {
+      log_name_suffix    = "stderr"
+      severity           = "ERROR"
+      additional_filters = "jsonPayload.error_code>=500"
+
+      condition_threshold = {
+        window    = 300  # 5 minutes
+        threshold = 1
+      }
+      json_payload_labels = ["transaction_id", "error_code"]
+    }
+  }
+
+  log_based_json_alert_policies = {
+    "email-and-payment-errors" = {
+      metric_keys = ["email-bounce-failure", "payment-processing-error"]
+      condition_threshold = {
+        window    = 600
         threshold = 1
       }
+      runbook_url = "https://github.com/org/repo/blob/main/docs/playbooks/json_alerts.md"
     }
   }
 
diff --git a/modules/alerts_cloud_run/alerts.tf b/modules/alerts_cloud_run/alerts.tf
@@ -30,6 +30,9 @@ locals {
   minute = 60 * local.second
   hour   = 60 * local.minute
   day    = 24 * local.hour
+
+
+  create_json_logging_alert_policy = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators
 }
 
 # Common forward progress #
@@ -296,6 +299,7 @@ resource "google_logging_metric" "json_payload_logging_metric" {
   metric_descriptor {
     metric_kind = "DELTA"
     value_type  = "INT64"
+
     labels {
       key         = "location"
       value_type  = "STRING"
@@ -306,44 +310,61 @@ resource "google_logging_metric" "json_payload_logging_metric" {
       value_type  = "STRING"
       description = "name of service"
     }
-  }
 
-  label_extractors = {
-    "location"     = "EXTRACT(resource.labels.location)"
-    "service_name" = "EXTRACT(resource.labels.service_name)"
+    dynamic "labels" {
+      for_each = each.value.json_payload_labels
+      content {
+        key         = labels.value
+        value_type  = "STRING"
+        description = "Extracted from jsonPayload: ${labels.value}"
+      }
+    }
   }
+
+  label_extractors = merge(
+    {
+      "location"     = "EXTRACT(resource.labels.location)"
+      "service_name" = "EXTRACT(resource.labels.service_name)"
+    },
+    {
+      for label in each.value.json_payload_labels :
+      label => "EXTRACT(jsonPayload.${label})"
+    }
+  )
 }
 
 resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
-  count = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators ? 1 : 0
+  for_each = local.create_json_logging_alert_policy ? var.log_based_json_alert_policies : {}
 
-  project = var.project_id
-
-  display_name = "LogBasedJSON-${local.resource_value}"
+  project      = var.project_id
+  display_name = "${each.key}-${local.resource_value}"
   severity     = "ERROR"
   combiner     = "OR"
 
   dynamic "conditions" {
-    for_each = var.log_based_json_indicators
+    for_each = each.value.metric_keys
 
     content {
-      display_name = "${conditions.key} logging high"
+      display_name = "${conditions.value}-logging-high"
 
       condition_threshold {
         filter = <<-EOT
-            metric.type="${local.user_metric_root_prefix}/${local.resource_value}-${conditions.key}"
-            resource.type="${local.resource_type}"
-          EOT
+          metric.type="logging.googleapis.com/user/${local.resource_value}-${conditions.value}"
+          resource.type="${local.resource_type}"
+        EOT
 
-        duration        = "${conditions.value.condition_threshold.window}s"
+        duration        = "${each.value.condition_threshold.window}s"
         comparison      = "COMPARISON_GT"
-        threshold_value = conditions.value.condition_threshold.threshold
+        threshold_value = each.value.condition_threshold.threshold
 
         aggregations {
           alignment_period     = "60s"
           per_series_aligner   = "ALIGN_SUM"
           cross_series_reducer = "REDUCE_SUM"
-          group_by_fields      = distinct(concat(local.default_group_by_fields, conditions.value.additional_group_by_fields != null ? conditions.value.additional_group_by_fields : []))
+          group_by_fields = distinct(concat(
+            ["resource.label.location"],
+            each.value.additional_group_by_fields != null ? each.value.additional_group_by_fields : []
+          ))
         }
 
         trigger {
@@ -355,28 +376,25 @@ resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
 
   alert_strategy {
     auto_close = "${local.day}s"
-
     notification_channel_strategy {
       renotify_interval = "${local.day}s"
     }
   }
 
   dynamic "documentation" {
-    for_each = var.runbook_urls.json_based_logs != null ? [1] : []
+    for_each = each.value.runbook_url != null ? [1] : []
     content {
-      content   = var.runbook_urls.json_based_logs
+      content   = each.value.runbook_url
       mime_type = "text/markdown"
     }
   }
 
   notification_channels = var.notification_channels_non_paging
 
-  depends_on = [
-    google_logging_metric.json_payload_logging_metric
-  ]
+  depends_on = [google_logging_metric.json_payload_logging_metric]
 }
 
-# CR service specific # 
+# CR service specific #
 
 resource "google_monitoring_alert_policy" "service_4xx_alert_policy" {
   count = !local.is_job && var.service_4xx_configuration.enabled ? 1 : 0
@@ -565,7 +583,7 @@ resource "google_monitoring_alert_policy" "service_max_conns_alert_policy" {
         metric.type="${local.metric_root_prefix}/container/max_request_concurrencies"
         resource.type="${local.resource_type}"
         resource.label.${local.resource_label}="${local.resource_value}"
-        ${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}        
+        ${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}
       EOT
 
       duration        = "${var.service_max_conns_configuration.window}s"
@@ -624,7 +642,7 @@ resource "google_monitoring_alert_policy" "job_failure_alert_policy" {
         metric.label.result="failed"
         resource.type="${local.resource_type}"
         resource.label.${local.resource_label}="${local.resource_value}"
-        ${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}        
+        ${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}
       EOT
 
       duration        = "${var.job_failure_configuration.window}s"
diff --git a/modules/alerts_cloud_run/variables.tf b/modules/alerts_cloud_run/variables.tf
@@ -133,14 +133,70 @@ variable "log_based_json_indicators" {
     })
     additional_filters         = optional(string)
     additional_group_by_fields = optional(list(string))
+    json_payload_labels        = optional(list(string), [])
   }))
+
   validation {
     condition = alltrue([
-      for k, v in var.log_based_json_indicators :
-      contains(["DEBUG", "INFO", "WARNING", "ERROR"], v.severity)
+      for key, indicator in var.log_based_json_indicators : (
+        contains(["DEBUG", "INFO", "WARNING", "ERROR"], indicator.severity) &&
+        indicator.condition_threshold.window >= 60 &&
+        indicator.condition_threshold.threshold >= 0 &&
+        can(regex("^[a-zA-Z0-9-_]+$", key)) &&
+        length(indicator.json_payload_labels) == length(distinct(indicator.json_payload_labels)) &&
+        alltrue([for label in indicator.json_payload_labels : can(regex("^[a-zA-Z0-9_]+$", label))])
+      )
     ])
-    error_message = "The 'severity' field must be one of: 'DEBUG', 'INFO', 'WARNING', 'ERROR'."
+    error_message = <<-EOT
+      Validation failed for log_based_json_indicators:
+      - severity must be one of: DEBUG, INFO, WARNING, ERROR.
+      - condition_threshold.window must be at least 60 seconds.
+      - condition_threshold.threshold must be zero or positive.
+      - indicator keys must contain only alphanumeric characters, dashes, or underscores.
+      - json_payload_labels must be unique and contain only alphanumeric characters and underscores.
+    EOT
   }
+
+  default = {}
+}
+
+# Alert policies built on top of the JSON log-based metrics. Each map key
+# becomes part of the policy display name; each entry of metric_keys becomes
+# one condition of that policy.
+variable "log_based_json_alert_policies" {
+  description = "Definition of alert policies, each potentially referencing multiple metrics."
+  type = map(object({
+    # Indicator keys whose log-based metrics this policy watches.
+    metric_keys = list(string)
+    condition_threshold = object({
+      # Condition duration in seconds (rendered as "<window>s").
+      window = number
+      # The condition fires when the metric is greater than this value.
+      threshold = number
+    })
+    # Extra fields appended to the aggregation's group_by_fields.
+    additional_group_by_fields = optional(list(string), [])
+    # Used as the policy documentation content when set.
+    runbook_url = optional(string)
+  }))
+
+  # One validation block per rule, so a failure names the rule that was broken.
+  validation {
+    condition = alltrue([
+      for policy_key, policy in var.log_based_json_alert_policies :
+      length(policy.metric_keys) > 0
+    ])
+    error_message = "Each policy in log_based_json_alert_policies must reference at least one metric key."
+  }
+
+  validation {
+    condition = alltrue([
+      for policy_key, policy in var.log_based_json_alert_policies :
+      length(policy.metric_keys) == length(distinct(policy.metric_keys))
+    ])
+    error_message = "Metric keys within a log_based_json_alert_policies policy must be unique."
+  }
+
+  validation {
+    condition = alltrue([
+      for policy_key, policy in var.log_based_json_alert_policies :
+      policy.condition_threshold.window >= 60
+    ])
+    error_message = "The condition_threshold.window of each log_based_json_alert_policies policy must be at least 60 seconds."
+  }
+
+  validation {
+    condition = alltrue([
+      for policy_key, policy in var.log_based_json_alert_policies :
+      policy.condition_threshold.threshold >= 0
+    ])
+    error_message = "The condition_threshold.threshold of each log_based_json_alert_policies policy must be zero or positive."
+  }
+
+  validation {
+    condition = alltrue([
+      for policy_key, policy in var.log_based_json_alert_policies :
+      policy.runbook_url == null || can(regex("^https://", policy.runbook_url))
+    ])
+    error_message = "The runbook_url of a log_based_json_alert_policies policy, if provided, must be a valid HTTPS URL."
+  }
+
+  validation {
+    # The hyphen is placed last in the character class so it is read as a
+    # literal; the accepted character set is unchanged.
+    condition = alltrue([
+      for policy_key, policy in var.log_based_json_alert_policies : (
+        can(regex("^[a-zA-Z0-9_-]+$", policy_key)) &&
+        alltrue([
+          for key in policy.metric_keys : can(regex("^[a-zA-Z0-9_-]+$", key))
+        ])
+      )
+    ])
+    error_message = "Policy keys and metric_keys in log_based_json_alert_policies must contain only alphanumeric characters, dashes (-), or underscores (_)."
+  }
+
  default = {}
 }