Skip to content

Commit 6fa61af

Browse files
committed
cloud_run_alerts: add flexible json-based log indicators and policies
1 parent ed3653b commit 6fa61af

File tree

3 files changed

+132
-34
lines changed

3 files changed

+132
-34
lines changed

modules/alerts_cloud_run/README.md

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,15 +43,39 @@ module "cloud_run_service_alerts" {
4343
}
4444
4545
enable_log_based_json_indicators = true
46-
log_based_text_indicators = {
47-
"scaling-failure" = {
48-
log_name_suffix = "requests"
49-
severity = "ERROR"
50-
text_payload_message = "The request was aborted because there was no available instance."
46+
log_based_json_indicators = {
47+
"email-bounce-failure" = {
48+
log_name_suffix = "stdout"
49+
severity = "ERROR"
50+
additional_filters = "jsonPayload.message=\"bounce error\" AND jsonPayload.method=\"POST\""
51+
5152
condition_threshold = {
52-
window = 10 * local.minute
53+
window = 600 # 10 minutes
54+
threshold = 0
55+
}
56+
}
57+
58+
"payment-processing-error" = {
59+
log_name_suffix = "stderr"
60+
severity = "ERROR"
61+
additional_filters = "jsonPayload.error_code>=500"
62+
63+
condition_threshold = {
64+
window = 300 # 5 minutes
65+
threshold = 1
66+
}
67+
json_payload_labels = ["transaction_id", "error_code"]
68+
}
69+
}
70+
71+
log_based_json_alert_policies = {
72+
"email-and-payment-errors" = {
73+
metric_keys = ["email-bounce-failure", "payment-processing-error"]
74+
condition_threshold = {
75+
window = 600
5376
threshold = 1
5477
}
78+
runbook_url = "https://github.com/org/repo/blob/main/docs/playbooks/json_alerts.md"
5579
}
5680
}
5781

modules/alerts_cloud_run/alerts.tf

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ locals {
3030
minute = 60 * local.second
3131
hour = 60 * local.minute
3232
day = 24 * local.hour
33+
34+
35+
create_json_logging_alert_policy = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators
3336
}
3437

3538
# Common forward progress #
@@ -296,6 +299,7 @@ resource "google_logging_metric" "json_payload_logging_metric" {
296299
metric_descriptor {
297300
metric_kind = "DELTA"
298301
value_type = "INT64"
302+
299303
labels {
300304
key = "location"
301305
value_type = "STRING"
@@ -306,44 +310,61 @@ resource "google_logging_metric" "json_payload_logging_metric" {
306310
value_type = "STRING"
307311
description = "name of service"
308312
}
309-
}
310313

311-
label_extractors = {
312-
"location" = "EXTRACT(resource.labels.location)"
313-
"service_name" = "EXTRACT(resource.labels.service_name)"
314+
dynamic "labels" {
315+
for_each = each.value.json_payload_labels
316+
content {
317+
key = labels.value
318+
value_type = "STRING"
319+
description = "Extracted from jsonPayload: ${labels.value}"
320+
}
321+
}
314322
}
323+
324+
label_extractors = merge(
325+
{
326+
"location" = "EXTRACT(resource.labels.location)"
327+
"service_name" = "EXTRACT(resource.labels.service_name)"
328+
},
329+
{
330+
for label in each.value.json_payload_labels :
331+
label => "EXTRACT(jsonPayload.${label})"
332+
}
333+
)
315334
}
316335

317336
resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
318-
count = length(keys(var.log_based_json_indicators)) > 0 && var.enable_log_based_json_indicators ? 1 : 0
337+
for_each = local.create_json_logging_alert_policy ? var.log_based_json_alert_policies : {}
319338

320-
project = var.project_id
321-
322-
display_name = "LogBasedJSON-${local.resource_value}"
339+
project = var.project_id
340+
display_name = "${each.key}-${local.resource_value}"
323341
severity = "ERROR"
324342
combiner = "OR"
325343

326344
dynamic "conditions" {
327-
for_each = var.log_based_json_indicators
345+
for_each = each.value.metric_keys
328346

329347
content {
330-
display_name = "${conditions.key} logging high"
348+
display_name = "${conditions.value}-logging-high"
331349

332350
condition_threshold {
333351
filter = <<-EOT
334-
metric.type="${local.user_metric_root_prefix}/${local.resource_value}-${conditions.key}"
335-
resource.type="${local.resource_type}"
336-
EOT
352+
metric.type="logging.googleapis.com/user/${local.resource_value}-${conditions.value}"
353+
resource.type="${local.resource_type}"
354+
EOT
337355

338-
duration = "${conditions.value.condition_threshold.window}s"
356+
duration = "${each.value.condition_threshold.window}s"
339357
comparison = "COMPARISON_GT"
340-
threshold_value = conditions.value.condition_threshold.threshold
358+
threshold_value = each.value.condition_threshold.threshold
341359

342360
aggregations {
343361
alignment_period = "60s"
344362
per_series_aligner = "ALIGN_SUM"
345363
cross_series_reducer = "REDUCE_SUM"
346-
group_by_fields = distinct(concat(local.default_group_by_fields, conditions.value.additional_group_by_fields != null ? conditions.value.additional_group_by_fields : []))
364+
group_by_fields = distinct(concat(
365+
["resource.label.location"],
366+
each.value.additional_group_by_fields != null ? each.value.additional_group_by_fields : []
367+
))
347368
}
348369

349370
trigger {
@@ -355,28 +376,25 @@ resource "google_monitoring_alert_policy" "json_payload_logging_alert_policy" {
355376

356377
alert_strategy {
357378
auto_close = "${local.day}s"
358-
359379
notification_channel_strategy {
360380
renotify_interval = "${local.day}s"
361381
}
362382
}
363383

364384
dynamic "documentation" {
365-
for_each = var.runbook_urls.json_based_logs != null ? [1] : []
385+
for_each = each.value.runbook_url != null ? [1] : []
366386
content {
367-
content = var.runbook_urls.json_based_logs
387+
content = each.value.runbook_url
368388
mime_type = "text/markdown"
369389
}
370390
}
371391

372392
notification_channels = var.notification_channels_non_paging
373393

374-
depends_on = [
375-
google_logging_metric.json_payload_logging_metric
376-
]
394+
depends_on = [google_logging_metric.json_payload_logging_metric]
377395
}
378396

379-
# CR service specific #
397+
# CR service specific #
380398

381399
resource "google_monitoring_alert_policy" "service_4xx_alert_policy" {
382400
count = !local.is_job && var.service_4xx_configuration.enabled ? 1 : 0
@@ -565,7 +583,7 @@ resource "google_monitoring_alert_policy" "service_max_conns_alert_policy" {
565583
metric.type="${local.metric_root_prefix}/container/max_request_concurrencies"
566584
resource.type="${local.resource_type}"
567585
resource.label.${local.resource_label}="${local.resource_value}"
568-
${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}
586+
${var.service_max_conns_configuration.additional_filters != null ? var.service_max_conns_configuration.additional_filters : ""}
569587
EOT
570588

571589
duration = "${var.service_max_conns_configuration.window}s"
@@ -624,7 +642,7 @@ resource "google_monitoring_alert_policy" "job_failure_alert_policy" {
624642
metric.label.result="failed"
625643
resource.type="${local.resource_type}"
626644
resource.label.${local.resource_label}="${local.resource_value}"
627-
${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}
645+
${var.job_failure_configuration.additional_filters != null ? var.job_failure_configuration.additional_filters : ""}
628646
EOT
629647

630648
duration = "${var.job_failure_configuration.window}s"

modules/alerts_cloud_run/variables.tf

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,14 +133,70 @@ variable "log_based_json_indicators" {
133133
})
134134
additional_filters = optional(string)
135135
additional_group_by_fields = optional(list(string))
136+
json_payload_labels = optional(list(string), [])
136137
}))
138+
137139
validation {
138140
condition = alltrue([
139-
for k, v in var.log_based_json_indicators :
140-
contains(["DEBUG", "INFO", "WARNING", "ERROR"], v.severity)
141+
for key, indicator in var.log_based_json_indicators : (
142+
contains(["DEBUG", "INFO", "WARNING", "ERROR"], indicator.severity) &&
143+
indicator.condition_threshold.window >= 60 &&
144+
indicator.condition_threshold.threshold >= 0 &&
145+
can(regex("^[a-zA-Z0-9-_]+$", key)) &&
146+
length(indicator.json_payload_labels) == length(distinct(indicator.json_payload_labels)) &&
147+
alltrue([for label in indicator.json_payload_labels : can(regex("^[a-zA-Z0-9_]+$", label))])
148+
)
141149
])
142-
error_message = "The 'severity' field must be one of: 'DEBUG', 'INFO', 'WARNING', 'ERROR'."
150+
error_message = <<-EOT
151+
Validation failed for log_based_json_indicators:
152+
- severity must be one of: DEBUG, INFO, WARNING, ERROR.
153+
- condition_threshold.window must be at least 60 seconds.
154+
- condition_threshold.threshold must be zero or positive.
155+
- indicator keys must contain only alphanumeric characters, dashes, or underscores.
156+
- json_payload_labels must be unique and contain only alphanumeric characters and underscores.
157+
EOT
143158
}
159+
160+
default = {}
161+
}
162+
163+
# Alert policies built on top of the JSON log-based metrics.
#
# Map key: policy name; it is used as the prefix of the alert policy's
# display name in alerts.tf.
#
# Fields:
#   metric_keys                - metric name suffixes to alert on; one alert
#                                condition is generated per entry.
#                                NOTE(review): these appear to be expected to
#                                match keys of var.log_based_json_indicators;
#                                that cross-variable link is not validated
#                                here - confirm.
#   condition_threshold.window - condition duration in seconds (>= 60).
#   condition_threshold.threshold
#                              - value the metric must exceed (>= 0).
#   additional_group_by_fields - extra group-by fields for the aggregation;
#                                defaults to an empty list.
#   runbook_url                - optional HTTPS link attached as the policy
#                                documentation.
#
# Each rule has its own validation block so that a failing plan reports the
# specific rule that was violated instead of one catch-all message.
variable "log_based_json_alert_policies" {
  description = "Definition of alert policies, each potentially referencing multiple metrics."

  type = map(object({
    metric_keys = list(string)
    condition_threshold = object({
      window    = number
      threshold = number
    })
    additional_group_by_fields = optional(list(string), [])
    runbook_url                = optional(string)
  }))

  default = {}

  validation {
    condition = alltrue([
      for policy_key in keys(var.log_based_json_alert_policies) :
      can(regex("^[a-zA-Z0-9_-]+$", policy_key))
    ])
    error_message = "Policy keys must contain only alphanumeric characters, dashes (-), or underscores (_)."
  }

  validation {
    condition = alltrue([
      for policy in values(var.log_based_json_alert_policies) :
      length(policy.metric_keys) > 0
    ])
    error_message = "Each policy must reference at least one metric key."
  }

  validation {
    condition = alltrue([
      for policy in values(var.log_based_json_alert_policies) :
      length(policy.metric_keys) == length(distinct(policy.metric_keys))
    ])
    error_message = "Metric keys within a policy must be unique."
  }

  validation {
    condition = alltrue(flatten([
      for policy in values(var.log_based_json_alert_policies) : [
        for metric_key in policy.metric_keys :
        can(regex("^[a-zA-Z0-9_-]+$", metric_key))
      ]
    ]))
    error_message = "Entries in metric_keys must contain only alphanumeric characters, dashes (-), or underscores (_)."
  }

  validation {
    condition = alltrue([
      for policy in values(var.log_based_json_alert_policies) :
      policy.condition_threshold.window >= 60
    ])
    error_message = "The condition_threshold.window value must be at least 60 seconds."
  }

  validation {
    condition = alltrue([
      for policy in values(var.log_based_json_alert_policies) :
      policy.condition_threshold.threshold >= 0
    ])
    error_message = "The condition_threshold.threshold value must be zero or positive."
  }

  validation {
    # A null runbook_url is allowed: the field is optional.
    condition = alltrue([
      for policy in values(var.log_based_json_alert_policies) :
      policy.runbook_url == null || can(regex("^https://", policy.runbook_url))
    ])
    error_message = "The runbook_url value, if provided, must be a valid HTTPS URL."
  }
}
146202

0 commit comments

Comments
 (0)