cloud_run_alerts: add flexible json-based log indicators and policies

andriipetruk · andriipetruk · commit 4e92aff220ac · 2025-04-02T12:16:01.000-07:00
diff --git a/modules/alerts_cloud_run/README.md b/modules/alerts_cloud_run/README.md
@@ -43,32 +43,50 @@ module "cloud_run_service_alerts" {
   }
 
   enable_log_based_text_indicators = true
-  log_based_text_indicators = {
-    "scaling-failure" = {
-      log_name_suffix      = "requests"
-      severity             = "ERROR"
-      text_payload_message = "The request was aborted because there was no available instance."
-      condition_threshold = {
-        window    = 10 * local.minute
-        threshold = 1
-      }
-    }
-  }
-
   enable_log_based_json_indicators = true
   log_based_json_indicators = {
     "email-bounce-failure" = {
       log_name_suffix    = "stdout"
       severity           = "ERROR"
-      additional_filters = "jsonPayload.message=\"foo\" AND jsonPayload.method=<your_method_name>"
+      additional_filters = "jsonPayload.message=\"bounce error\" AND jsonPayload.method=\"POST\""
 
       condition_threshold = {
-        window    = 10 * local.minute
+        window    = 600  # 10 minutes
         threshold = 0
       }
+
+      label_extractors = {}  # No custom labels extracted
+    }
+
+    "payment-processing-error" = {
+      log_name_suffix    = "stderr"
+      severity           = "ERROR"
+      additional_filters = "jsonPayload.error_code>=500"
+
+      condition_threshold = {
+        window    = 300  # 5 minutes
+        threshold = 1
+      }
+
+      label_extractors = {
+        transaction_id = "EXTRACT(jsonPayload.transaction_id)"
+        error_code     = "EXTRACT(jsonPayload.error_code)"
+      }
     }
   }
 
+  log_based_json_alert_policies = {
+    "email-and-payment-errors" = {
+      metric_keys = ["email-bounce-failure", "payment-processing-error"]
+      condition_threshold = {
+        window    = 600
+        threshold = 1
+      }
+      runbook_url = "https://github.com/org/repo/blob/main/docs/playbooks/json_alerts.md"
+    }
+  }
+
+
   service_4xx_configuration = {
     enabled   = true
     window    = 300
diff --git a/modules/alerts_cloud_run/alerts.tf b/modules/alerts_cloud_run/alerts.tf
@@ -30,6 +30,9 @@ locals {
   minute = 60 * local.second
   hour   = 60 * local.minute
   day    = 24 * local.hour
+
+
+  create_json_logging_alert_policy = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators
 }
 
 # Common forward progress #
@@ -296,6 +299,7 @@ resource "google_logging_metric" "json_payload_logging_metric" {
   metric_descriptor {
     metric_kind = "DELTA"
     value_type  = "INT64"
+
     labels {
       key         = "location"
       value_type  = "STRING"
@@ -306,44 +310,58 @@ resource "google_logging_metric" "json_payload_logging_metric" {
       value_type  = "STRING"
       description = "name of service"
     }
-  }
 
-  label_extractors = {
-    "location"     = "EXTRACT(resource.labels.location)"
-    "service_name" = "EXTRACT(resource.labels.service_name)"
+    dynamic "labels" {
+      for_each = each.value.label_extractors
+      content {
+        key         = labels.key
+        value_type  = "STRING"
+        description = "Custom extracted label: ${labels.key}"
+      }
+    }
   }
+
+  label_extractors = merge(
+    {
+      "location"     = "EXTRACT(resource.labels.location)"
+      "service_name" = "EXTRACT(resource.labels.service_name)"
+    },
+    each.value.label_extractors
+  )
 }
 
 resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
-  count = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators ? 1 : 0
+  for_each = local.create_json_logging_alert_policy ? var.log_based_json_alert_policies : {}
 
-  project = var.project_id
-
-  display_name = "LogBasedJSON-${local.resource_value}"
+  project      = var.project_id
+  display_name = "${each.key}-${local.resource_value}"
   severity     = "ERROR"
   combiner     = "OR"
 
   dynamic "conditions" {
-    for_each = var.log_based_json_indicators
+    for_each = each.value.metric_keys
 
     content {
-      display_name = "${conditions.key} logging high"
+      display_name = "${conditions.value}-logging-high"
 
       condition_threshold {
         filter = <<-EOT
-            metric.type="${local.user_metric_root_prefix}/${local.resource_value}-${conditions.key}"
-            resource.type="${local.resource_type}"
-          EOT
+          metric.type = "${local.user_metric_root_prefix}/${local.resource_value}-${conditions.value}"
+          resource.type="${local.resource_type}"
+        EOT
 
-        duration        = "${conditions.value.condition_threshold.window}s"
+        duration        = "${each.value.condition_threshold.window}s"
         comparison      = "COMPARISON_GT"
-        threshold_value = conditions.value.condition_threshold.threshold
+        threshold_value = each.value.condition_threshold.threshold
 
         aggregations {
           alignment_period     = "60s"
           per_series_aligner   = "ALIGN_SUM"
           cross_series_reducer = "REDUCE_SUM"
-          group_by_fields      = distinct(concat(local.default_group_by_fields, conditions.value.additional_group_by_fields != null ? conditions.value.additional_group_by_fields : []))
+          group_by_fields = distinct(concat(
+            local.default_group_by_fields,
+            each.value.additional_group_by_fields != null ? each.value.additional_group_by_fields : [] # read from the policy object: conditions.value is only a metric key string here
+          ))
         }
 
         trigger {
@@ -355,28 +373,25 @@ resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
 
   alert_strategy {
     auto_close = "${local.day}s"
-
     notification_channel_strategy {
       renotify_interval = "${local.day}s"
     }
   }
 
   dynamic "documentation" {
-    for_each = var.runbook_urls.json_based_logs != null ? [1] : []
+    for_each = each.value.runbook_url != null ? [1] : []
     content {
-      content   = var.runbook_urls.json_based_logs
+      content   = each.value.runbook_url
       mime_type = "text/markdown"
     }
   }
 
   notification_channels = var.notification_channels_non_paging
 
-  depends_on = [
-    google_logging_metric.json_payload_logging_metric
-  ]
+  depends_on = [google_logging_metric.json_payload_logging_metric]
 }
 
-# CR service specific # 
+# CR service specific #
 
 resource "google_monitoring_alert_policy" "service_4xx_alert_policy" {
   count = !local.is_job && var.service_4xx_configuration.enabled ? 1 : 0
@@ -565,7 +580,7 @@ resource "google_monitoring_alert_policy" "service_max_conns_alert_policy" {
         metric.type="${local.metric_root_prefix}/container/max_request_concurrencies"
         resource.type="${local.resource_type}"
         resource.label.${local.resource_label}="${local.resource_value}"
-        ${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}        
+        ${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}
       EOT
 
       duration        = "${var.service_max_conns_configuration.window}s"
@@ -624,7 +639,7 @@ resource "google_monitoring_alert_policy" "job_failure_alert_policy" {
         metric.label.result="failed"
         resource.type="${local.resource_type}"
         resource.label.${local.resource_label}="${local.resource_value}"
-        ${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}        
+        ${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}
       EOT
 
       duration        = "${var.job_failure_configuration.window}s"
diff --git a/modules/alerts_cloud_run/variables.tf b/modules/alerts_cloud_run/variables.tf
@@ -123,7 +123,7 @@ variable "enable_log_based_json_indicators" {
 }
 
 variable "log_based_json_indicators" {
-  description = "Map for log based indicators using JSON payload. Payload message is a regex match."
+  description = "Map for log-based indicators with custom label extraction."
   type = map(object({
     log_name_suffix = string
     severity        = string
@@ -133,14 +133,73 @@ variable "log_based_json_indicators" {
     })
     additional_filters         = optional(string)
     additional_group_by_fields = optional(list(string))
+    label_extractors           = optional(map(string), {})
   }))
+
+  # Validation inspired by:
+  # - Log severity levels: https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry#logseverity
+  # - Metric naming conventions: https://cloud.google.com/logging/docs/logs-based-metrics/naming-restrictions
   validation {
     condition = alltrue([
-      for k, v in var.log_based_json_indicators :
-      contains(["DEBUG", "INFO", "WARNING", "ERROR"], v.severity)
+      for key, indicator in var.log_based_json_indicators : (
+        contains(["DEBUG", "INFO", "WARNING", "ERROR"], indicator.severity) &&
+        indicator.condition_threshold.window >= 60 &&
+        indicator.condition_threshold.threshold >= 0 &&
+        can(regex("^[a-zA-Z0-9-_]+$", key)) &&
+        alltrue([for label_key in keys(indicator.label_extractors) : !contains(["location", "service_name"], label_key)]) && # these two labels are always declared by the metric; a custom duplicate would collide with them
+        alltrue([for label_key in keys(indicator.label_extractors) : can(regex("^[a-zA-Z0-9_]+$", label_key))])
+      )
     ])
-    error_message = "The 'severity' field must be one of: 'DEBUG', 'INFO', 'WARNING', 'ERROR'."
+    error_message = <<-EOT
+      Validation failed for log_based_json_indicators:
+      - severity must be one of: DEBUG, INFO, WARNING, ERROR.
+      - condition_threshold.window must be at least 60 seconds.
+      - condition_threshold.threshold must be zero or positive.
+      - indicator keys must contain only alphanumeric characters, dashes, or underscores.
+      - label_extractors keys must contain only alphanumeric characters and underscores, and must not be "location" or "service_name" (reserved built-in labels).
+    EOT
   }
+
+  default = {}
+}
+
+variable "log_based_json_alert_policies" {
+  description = "Definition of alert policies, each potentially referencing multiple metrics."
+  type = map(object({
+    metric_keys = list(string) # one alert condition is generated per entry; expected to name keys of log_based_json_indicators
+    condition_threshold = object({
+      window    = number # seconds; rendered as the condition's duration
+      threshold = number # condition fires when the aggregated count is greater than this value
+    })
+    additional_group_by_fields = optional(list(string), [])
+    runbook_url                = optional(string) # rendered as the policy's markdown documentation when set
+  }))
+
+  validation {
+    condition = alltrue([
+      for policy_key, policy in var.log_based_json_alert_policies : (
+        length(policy.metric_keys) > 0 &&
+        length(policy.metric_keys) == length(distinct(policy.metric_keys)) &&
+        policy.condition_threshold.window >= 60 && policy.condition_threshold.window % 60 == 0 && # Cloud Monitoring only accepts durations in whole minutes
+        policy.condition_threshold.threshold >= 0 &&
+        (policy.runbook_url == null || can(regex("^https://", policy.runbook_url))) &&
+        alltrue([
+          for key in policy.metric_keys : can(regex("^[a-zA-Z0-9-_]+$", key))
+        ]) &&
+        can(regex("^[a-zA-Z0-9-_]+$", policy_key))
+      )
+    ])
+    error_message = <<-EOT
+      Validation failed for log_based_json_alert_policies:
+      - Each policy must reference at least one metric key.
+      - Metric keys within a policy must be unique.
+      - condition_threshold.window must be at least 60 seconds and a multiple of 60.
+      - condition_threshold.threshold must be zero or positive.
+      - policy keys and metric_keys must contain only alphanumeric characters, dashes (-), or underscores (_).
+      - runbook_url, if provided, must be a valid HTTPS URL.
+    EOT
+  }
+
   default = {}
 }