Skip to content

Commit 4e92aff

Browse files
committed
cloud_run_alerts: add flexible json-based log indicators and policies
1 parent ed3653b commit 4e92aff

File tree

3 files changed

+135
-44
lines changed

3 files changed

+135
-44
lines changed

modules/alerts_cloud_run/README.md

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -43,32 +43,49 @@ module "cloud_run_service_alerts" {
4343
}
4444
4545
enable_log_based_text_indicators = true
46-
log_based_text_indicators = {
47-
"scaling-failure" = {
48-
log_name_suffix = "requests"
49-
severity = "ERROR"
50-
text_payload_message = "The request was aborted because there was no available instance."
51-
condition_threshold = {
52-
window = 10 * local.minute
53-
threshold = 1
54-
}
55-
}
56-
}
57-
58-
enable_log_based_json_indicators = true
5946
log_based_json_indicators = {
6047
"email-bounce-failure" = {
6148
log_name_suffix = "stdout"
6249
severity = "ERROR"
63-
additional_filters = "jsonPayload.message=\"foo\" AND jsonPayload.method=<your_method_name>"
50+
additional_filters = "jsonPayload.message=\"bounce error\" AND jsonPayload.method=\"POST\""
6451
6552
condition_threshold = {
66-
window = 10 * local.minute
53+
window = 600 # 10 minutes
6754
threshold = 0
6855
}
56+
57+
label_extractors = {} # No custom labels extracted
58+
}
59+
60+
"payment-processing-error" = {
61+
log_name_suffix = "stderr"
62+
severity = "ERROR"
63+
additional_filters = "jsonPayload.error_code>=500"
64+
65+
condition_threshold = {
66+
window = 300 # 5 minutes
67+
threshold = 1
68+
}
69+
70+
label_extractors = {
71+
transaction_id = "EXTRACT(jsonPayload.transaction_id)"
72+
error_code = "EXTRACT(jsonPayload.error_code)"
73+
}
6974
}
7075
}
7176
77+
log_based_json_alert_policies = {
78+
"email-and-payment-errors" = {
79+
metric_keys = ["email-bounce-failure", "payment-processing-error"]
80+
condition_threshold = {
81+
window = 600
82+
threshold = 1
83+
}
84+
runbook_url = "https://github.com/org/repo/blob/main/docs/playbooks/json_alerts.md"
85+
}
86+
}
87+
88+
7289
service_4xx_configuration = {
7390
enabled = true
7491
window = 300

modules/alerts_cloud_run/alerts.tf

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ locals {
3030
minute = 60 * local.second
3131
hour = 60 * local.minute
3232
day = 24 * local.hour
33+
34+
35+
create_json_logging_alert_policy = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators
3336
}
3437

3538
# Common forward progress #
@@ -296,6 +299,7 @@ resource "google_logging_metric" "json_payload_logging_metric" {
296299
metric_descriptor {
297300
metric_kind = "DELTA"
298301
value_type = "INT64"
302+
299303
labels {
300304
key = "location"
301305
value_type = "STRING"
@@ -306,44 +310,58 @@ resource "google_logging_metric" "json_payload_logging_metric" {
306310
value_type = "STRING"
307311
description = "name of service"
308312
}
309-
}
310313

311-
label_extractors = {
312-
"location" = "EXTRACT(resource.labels.location)"
313-
"service_name" = "EXTRACT(resource.labels.service_name)"
314+
dynamic "labels" {
315+
for_each = each.value.label_extractors
316+
content {
317+
key = labels.key
318+
value_type = "STRING"
319+
description = "Custom extracted label: ${labels.key}"
320+
}
321+
}
314322
}
323+
324+
label_extractors = merge(
325+
{
326+
"location" = "EXTRACT(resource.labels.location)"
327+
"service_name" = "EXTRACT(resource.labels.service_name)"
328+
},
329+
each.value.label_extractors
330+
)
315331
}
316332

317333
resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
318-
count = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators ? 1 : 0
334+
for_each = local.create_json_logging_alert_policy ? var.log_based_json_alert_policies : {}
319335

320-
project = var.project_id
321-
322-
display_name = "LogBasedJSON-${local.resource_value}"
336+
project = var.project_id
337+
display_name = "${each.key}-${local.resource_value}"
323338
severity = "ERROR"
324339
combiner = "OR"
325340

326341
dynamic "conditions" {
327-
for_each = var.log_based_json_indicators
342+
for_each = each.value.metric_keys
328343

329344
content {
330-
display_name = "${conditions.key} logging high"
345+
display_name = "${conditions.value}-logging-high"
331346

332347
condition_threshold {
333348
filter = <<-EOT
334-
metric.type="${local.user_metric_root_prefix}/${local.resource_value}-${conditions.key}"
335-
resource.type="${local.resource_type}"
336-
EOT
349+
metric.type = "${local.user_metric_root_prefix}/${local.resource_value}-${conditions.value}"
350+
resource.type="${local.resource_type}"
351+
EOT
337352

338-
duration = "${conditions.value.condition_threshold.window}s"
353+
duration = "${each.value.condition_threshold.window}s"
339354
comparison = "COMPARISON_GT"
340-
threshold_value = conditions.value.condition_threshold.threshold
355+
threshold_value = each.value.condition_threshold.threshold
341356

342357
aggregations {
343358
alignment_period = "60s"
344359
per_series_aligner = "ALIGN_SUM"
345360
cross_series_reducer = "REDUCE_SUM"
346-
group_by_fields = distinct(concat(local.default_group_by_fields, conditions.value.additional_group_by_fields != null ? conditions.value.additional_group_by_fields : []))
361+
group_by_fields = distinct(concat(
362+
local.default_group_by_fields,
363+
# Policy-level field: conditions.value is a metric-key string here, so read the list from each.value (defaults to []).
each.value.additional_group_by_fields
364+
))
347365
}
348366

349367
trigger {
@@ -355,28 +373,25 @@ resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
355373

356374
alert_strategy {
357375
auto_close = "${local.day}s"
358-
359376
notification_channel_strategy {
360377
renotify_interval = "${local.day}s"
361378
}
362379
}
363380

364381
dynamic "documentation" {
365-
for_each = var.runbook_urls.json_based_logs != null ? [1] : []
382+
for_each = each.value.runbook_url != null ? [1] : []
366383
content {
367-
content = var.runbook_urls.json_based_logs
384+
content = each.value.runbook_url
368385
mime_type = "text/markdown"
369386
}
370387
}
371388

372389
notification_channels = var.notification_channels_non_paging
373390

374-
depends_on = [
375-
google_logging_metric.json_payload_logging_metric
376-
]
391+
depends_on = [google_logging_metric.json_payload_logging_metric]
377392
}
378393

379-
# CR service specific #
394+
# CR service specific #
380395

381396
resource "google_monitoring_alert_policy" "service_4xx_alert_policy" {
382397
count = !local.is_job && var.service_4xx_configuration.enabled ? 1 : 0
@@ -565,7 +580,7 @@ resource "google_monitoring_alert_policy" "service_max_conns_alert_policy" {
565580
metric.type="${local.metric_root_prefix}/container/max_request_concurrencies"
566581
resource.type="${local.resource_type}"
567582
resource.label.${local.resource_label}="${local.resource_value}"
568-
${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}
583+
${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}
569584
EOT
570585

571586
duration = "${var.service_max_conns_configuration.window}s"
@@ -624,7 +639,7 @@ resource "google_monitoring_alert_policy" "job_failure_alert_policy" {
624639
metric.label.result="failed"
625640
resource.type="${local.resource_type}"
626641
resource.label.${local.resource_label}="${local.resource_value}"
627-
${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}
642+
${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}
628643
EOT
629644

630645
duration = "${var.job_failure_configuration.window}s"

modules/alerts_cloud_run/variables.tf

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ variable "enable_log_based_json_indicators" {
123123
}
124124

125125
variable "log_based_json_indicators" {
126-
description = "Map for log based indicators using JSON payload. Payload message is a regex match."
126+
description = "Map for log-based indicators with custom label extraction."
127127
type = map(object({
128128
log_name_suffix = string
129129
severity = string
@@ -133,14 +133,73 @@ variable "log_based_json_indicators" {
133133
})
134134
additional_filters = optional(string)
135135
additional_group_by_fields = optional(list(string))
136+
label_extractors = optional(map(string), {})
136137
}))
138+
139+
# Validation inspired by:
140+
# - Log severity levels: https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry#logseverity
141+
# - Metric naming conventions: https://cloud.google.com/logging/docs/logs-based-metrics/naming-restrictions
137142
validation {
138143
condition = alltrue([
139-
for k, v in var.log_based_json_indicators :
140-
contains(["DEBUG", "INFO", "WARNING", "ERROR"], v.severity)
144+
for key, indicator in var.log_based_json_indicators : (
145+
contains(["DEBUG", "INFO", "WARNING", "ERROR"], indicator.severity) &&
146+
indicator.condition_threshold.window >= 60 &&
147+
indicator.condition_threshold.threshold >= 0 &&
148+
can(regex("^[a-zA-Z0-9-_]+$", key)) &&
149+
length(keys(indicator.label_extractors)) == length(distinct(keys(indicator.label_extractors))) &&
150+
alltrue([for label_key in keys(indicator.label_extractors) : can(regex("^[a-zA-Z0-9_]+$", label_key))])
151+
)
141152
])
142-
error_message = "The 'severity' field must be one of: 'DEBUG', 'INFO', 'WARNING', 'ERROR'."
153+
error_message = <<-EOT
154+
Validation failed for log_based_json_indicators:
155+
- severity must be one of: DEBUG, INFO, WARNING, ERROR.
156+
- condition_threshold.window must be at least 60 seconds.
157+
- condition_threshold.threshold must be zero or positive.
158+
- indicator keys must contain only alphanumeric characters, dashes, or underscores.
159+
- label_extractors keys must be unique and contain only alphanumeric characters and underscores.
160+
EOT
143161
}
162+
163+
default = {}
164+
}
165+
166+
variable "log_based_json_alert_policies" {
167+
description = "Definition of alert policies, each potentially referencing multiple metrics."
168+
type = map(object({
169+
metric_keys = list(string)
170+
condition_threshold = object({
171+
window = number
172+
threshold = number
173+
})
174+
additional_group_by_fields = optional(list(string), [])
175+
runbook_url = optional(string)
176+
}))
177+
178+
validation {
179+
condition = alltrue([
180+
for policy_key, policy in var.log_based_json_alert_policies : (
181+
length(policy.metric_keys) > 0 &&
182+
length(policy.metric_keys) == length(distinct(policy.metric_keys)) &&
183+
policy.condition_threshold.window >= 60 &&
184+
policy.condition_threshold.threshold >= 0 &&
185+
(policy.runbook_url == null || can(regex("^https://", policy.runbook_url))) &&
186+
alltrue([
187+
for key in policy.metric_keys : can(regex("^[a-zA-Z0-9-_]+$", key))
188+
]) &&
189+
can(regex("^[a-zA-Z0-9-_]+$", policy_key))
190+
)
191+
])
192+
error_message = <<-EOT
193+
Validation failed for log_based_json_alert_policies:
194+
- Each policy must reference at least one metric key.
195+
- Metric keys within a policy must be unique.
196+
- condition_threshold.window must be at least 60 seconds.
197+
- condition_threshold.threshold must be zero or positive.
198+
- policy keys and metric_keys must contain only alphanumeric characters, dashes (-), or underscores (_).
199+
- runbook_url, if provided, must be a valid HTTPS URL.
200+
EOT
201+
}
202+
144203
default = {}
145204
}
146205

0 commit comments

Comments
 (0)