Skip to content

Commit be84c6c

Browse files
authored
Merge pull request #164 from massgov/DP-30933-alert-names
DP-30933: add generic alert module, rename existing alerts
2 parents 7ba701c + ce0b8a8 commit be84c6c

File tree

17 files changed

+421
-310
lines changed

17 files changed

+421
-310
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changelog
22

3+
## [1.0.83] - 2023-12-04
4+
5+
- [New Relic] Add a generic `nrql-alert` module and improve naming for all alert conditions.
6+
37
## [1.0.82] - 2023-12-01
48

59
- [CloudFront Geo Restriction] Mark country codes as nonsensitive in terraform.

newrelic/alert-conditions-cloudfront/main.tf

Lines changed: 29 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -11,58 +11,47 @@ locals {
1111

1212
}
1313

14-
resource "newrelic_nrql_alert_condition" "error_rate" {
14+
module "error_rate" {
15+
source = "../nrql-alert"
16+
1517
account_id = var.account_id
1618
policy_id = var.alert_policy_id
17-
type = "static"
18-
name = "${var.name_prefix} - Error Rate"
19-
enabled = true
20-
violation_time_limit_seconds = 259200
21-
22-
nrql {
23-
query = "SELECT average(aws.cloudfront.TotalErrorRate) FROM Metric ${local.filter_subquery} FACET entity.name"
24-
}
25-
26-
critical {
27-
operator = "above"
28-
threshold = var.error_rate_threshold
29-
threshold_duration = var.critical_threshold_duration
30-
threshold_occurrences = "all"
31-
}
32-
fill_option = "none"
19+
name = format(
20+
"%s - Error rate over %s%% for at least %d seconds",
21+
var.name_prefix,
22+
replace(format("%f", var.error_rate_threshold), "/\\.0+$/", ""),
23+
var.critical_threshold_duration
24+
)
25+
26+
nrql_query = "SELECT average(aws.cloudfront.TotalErrorRate) FROM Metric ${local.filter_subquery} FACET entity.name"
27+
critical_threshold = var.error_rate_threshold
28+
critical_threshold_duration = var.critical_threshold_duration
3329
aggregation_window = var.aggregation_window
3430
aggregation_method = "event_flow"
3531
aggregation_delay = 120
36-
37-
open_violation_on_expiration = false
38-
close_violations_on_expiration = false
32+
tags = var.tags
3933
}
4034

41-
resource "newrelic_nrql_alert_condition" "throughput" {
35+
module "throughput" {
36+
source = "../nrql-alert"
4237
count = (var.throughput_enabled ? 1 : 0)
4338

4439
account_id = var.account_id
4540
policy_id = var.alert_policy_id
46-
type = "static"
47-
name = "${var.name_prefix} - Throughput"
48-
enabled = true
49-
violation_time_limit_seconds = 259200
50-
51-
nrql {
52-
query = "SELECT average(aws.cloudfront.Requests) FROM Metric ${local.filter_subquery} FACET entity.name"
53-
}
54-
55-
critical {
56-
operator = "below"
57-
threshold = var.throughput_threshold
58-
threshold_duration = var.critical_threshold_duration
59-
threshold_occurrences = "all"
60-
}
61-
fill_option = "none"
41+
name = format("%s - Less than %d requests per %d seconds for over %d seconds",
42+
var.name_prefix,
43+
var.throughput_threshold,
44+
var.aggregation_window,
45+
var.critical_threshold_duration
46+
)
47+
48+
nrql_query = "SELECT average(aws.cloudfront.Requests) FROM Metric ${local.filter_subquery} FACET entity.name"
49+
50+
critical_operator = "below"
51+
critical_threshold = var.throughput_threshold
52+
critical_threshold_duration = var.critical_threshold_duration
6253
aggregation_window = var.aggregation_window
6354
aggregation_method = "event_flow"
6455
aggregation_delay = 120
65-
66-
open_violation_on_expiration = false
67-
close_violations_on_expiration = false
56+
tags = var.tags
6857
}

newrelic/alert-conditions-cloudfront/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,9 @@ variable "throughput_threshold" {
5454
description = "Minimum number of requests per minute before triggering throughput alert."
5555
default = 5
5656
}
57+
58+
variable "tags" {
59+
type = map(any)
60+
description = "Tags to apply to the alert conditions. Tag values can either be a single string or a list of strings."
61+
default = {}
62+
}

newrelic/alert-conditions-ec2/main.tf

Lines changed: 60 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -27,121 +27,96 @@ locals {
2727
duration = var.critical_threshold_duration == null ? local.default_duration : var.critical_threshold_duration
2828
}
2929

30-
resource "newrelic_nrql_alert_condition" "alert" {
30+
31+
module "cpu" {
32+
source = "../nrql-alert"
33+
3134
account_id = var.account_id
3235
policy_id = var.alert_policy_id
33-
type = "static"
34-
name = "${var.name_prefix} - CPU"
35-
enabled = true
36-
violation_time_limit_seconds = 259200
37-
38-
nrql {
39-
query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET aws.ec2.InstanceId"
40-
}
41-
42-
critical {
43-
operator = "above"
44-
threshold = var.critical_threshold
45-
threshold_duration = local.duration
46-
threshold_occurrences = "all"
47-
}
48-
fill_option = "none"
36+
name = format(
37+
"%s - CPU utilization over %s%% for at least %d seconds",
38+
var.name_prefix,
39+
replace(format("%f", var.critical_threshold), "/\\.0+$/", ""),
40+
local.duration
41+
)
42+
43+
nrql_query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET aws.ec2.InstanceId"
44+
critical_threshold = var.critical_threshold
45+
critical_threshold_duration = local.duration
4946
aggregation_window = local.window
5047
aggregation_method = "event_timer"
5148
aggregation_timer = local.timer
52-
expiration_duration = 600
53-
open_violation_on_expiration = false
54-
close_violations_on_expiration = false
49+
tags = var.tags
5550
}
5651

57-
resource "newrelic_nrql_alert_condition" "loss_of_signal" {
52+
module "loss_of_signal" {
5853
count = (var.alert_loss_of_signal ? 1 : 0)
54+
source = "../nrql-alert"
5955

6056
account_id = var.account_id
6157
policy_id = var.alert_policy_id
62-
type = "static"
63-
name = "${var.name_prefix} - Loss of Signal"
64-
enabled = true
65-
violation_time_limit_seconds = 259200
66-
67-
nrql {
68-
query = "SELECT average(aws.ec2.CPUUtilization) FROM Metric ${local.filter_subquery} FACET tags.Name"
69-
}
70-
71-
critical {
72-
operator = "above"
73-
# This should never actually trigger, since CPUUtilization is a percent.
74-
# We don't care about this condition, we're just using this alert to use
75-
# the "open_violation_on_expiration" parameter to detect signal loss (by
76-
# instance name instead of instance id). Otherwise, every instance refresh
77-
# causes alerts/an "incident" in NR.
78-
threshold = 101
79-
threshold_duration = local.duration
80-
threshold_occurrences = "all"
81-
}
82-
fill_option = "none"
58+
name = format(
59+
"%s - No metrics reported for at least %d seconds",
60+
var.name_prefix,
61+
600
62+
)
63+
64+
nrql_query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET tags.Name"
65+
# This should never actually trigger, since CPUUtilization is a percent.
66+
# We don't care about this condition itself; this alert exists only to use
67+
# the "open_violation_on_expiration" parameter to detect signal loss (by
68+
# instance name instead of instance id). Otherwise, every instance refresh
69+
# causes alerts/an "incident" in NR.
70+
critical_threshold = 101
71+
critical_threshold_duration = local.duration
8372
aggregation_window = local.window
8473
aggregation_method = "event_timer"
8574
aggregation_timer = local.timer
8675
expiration_duration = 600
8776
open_violation_on_expiration = true
88-
close_violations_on_expiration = false
77+
tags = var.tags
8978
}
9079

91-
resource "newrelic_nrql_alert_condition" "memory" {
80+
module "memory" {
9281
count = (var.use_agent_metrics ? 1 : 0)
82+
source = "../nrql-alert"
9383

9484
account_id = var.account_id
9585
policy_id = var.alert_policy_id
96-
type = "static"
97-
name = "${var.name_prefix} - Memory"
98-
enabled = true
99-
violation_time_limit_seconds = 259200
100-
101-
nrql {
102-
query = "SELECT average(memoryUsedPercent) FROM SystemSample ${local.filter_subquery} FACET aws.ec2.InstanceId"
103-
}
104-
105-
critical {
106-
operator = "above"
107-
threshold = var.critical_threshold
108-
threshold_duration = local.duration
109-
threshold_occurrences = "all"
110-
}
111-
fill_option = "none"
112-
aggregation_window = local.window
86+
name = format(
87+
"%s - Memory usage over %s%% for at least %d seconds",
88+
var.name_prefix,
89+
replace(format("%f", var.critical_threshold), "/\\.0+$/", ""),
90+
local.duration
91+
)
92+
93+
nrql_query = "SELECT average(memoryUsedPercent) FROM SystemSample ${local.filter_subquery} FACET aws.ec2.InstanceId"
94+
critical_threshold = var.critical_threshold
95+
critical_threshold_duration = local.duration
11396
aggregation_method = "event_timer"
97+
aggregation_window = local.window
11498
aggregation_timer = local.timer
115-
expiration_duration = 600
116-
open_violation_on_expiration = false
117-
close_violations_on_expiration = false
99+
tags = var.tags
118100
}
119101

120-
resource "newrelic_nrql_alert_condition" "storage" {
102+
module "storage" {
121103
count = (var.use_agent_metrics ? 1 : 0)
104+
source = "../nrql-alert"
122105

123106
account_id = var.account_id
124107
policy_id = var.alert_policy_id
125-
type = "static"
126-
name = "${var.name_prefix} - Storage"
127-
enabled = true
128-
violation_time_limit_seconds = 259200
129-
130-
nrql {
131-
query = "SELECT average(diskUsedPercent) FROM StorageSample ${local.filter_subquery} FACET `tags.Name`, mountPoint"
132-
}
133-
134-
critical {
135-
operator = "above"
136-
threshold = var.critical_threshold
137-
threshold_duration = local.duration
138-
threshold_occurrences = "all"
139-
}
140-
fill_option = "none"
141-
aggregation_window = local.window
108+
name = format(
109+
"%s - Storage usage over %s%% for at least %d seconds",
110+
var.name_prefix,
111+
replace(format("%f", var.critical_threshold), "/\\.0+$/", ""),
112+
local.duration
113+
)
114+
115+
nrql_query = "SELECT average(diskUsedPercent) FROM StorageSample ${local.filter_subquery} FACET `tags.Name`, mountPoint"
116+
critical_threshold = var.critical_threshold
117+
critical_threshold_duration = local.duration
142118
aggregation_method = "event_timer"
119+
aggregation_window = local.window
143120
aggregation_timer = local.timer
144-
expiration_duration = 600
145-
open_violation_on_expiration = false
146-
close_violations_on_expiration = false
121+
tags = var.tags
147122
}

newrelic/alert-conditions-ec2/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,9 @@ variable "use_agent_metrics" {
6060
description = "Build the alerts using the extended metrics generated by the New Relic EC2 agent."
6161
default = false
6262
}
63+
64+
variable "tags" {
65+
type = map(any)
66+
description = "Tags to apply to the alert conditions. Tag values can either be a single string or a list of strings."
67+
default = {}
68+
}

newrelic/alert-conditions-ecs-cluster/main.tf

Lines changed: 27 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -8,59 +8,47 @@ locals {
88
filter_subqueries_and = join(" AND ", compact([local.aws_accounts_subquery, local.cluster_names_subquery]))
99

1010
filter_subquery = length(local.filter_subqueries_and) == 0 ? "" : "WHERE (${local.filter_subqueries_and})"
11-
1211
}
1312

14-
resource "newrelic_nrql_alert_condition" "cpu" {
13+
module "cpu" {
14+
source = "../nrql-alert"
15+
1516
account_id = var.account_id
1617
policy_id = var.alert_policy_id
17-
type = "static"
18-
name = "${var.name_prefix} - CPU"
19-
enabled = true
20-
violation_time_limit_seconds = 259200
18+
name = format(
19+
"%s - CPU utilization over %s%% for at least %d seconds",
20+
var.name_prefix,
21+
replace(format("%f", var.cpu_threshold), "/\\.0+$/", ""),
22+
var.critical_threshold_duration
23+
)
2124

22-
nrql {
23-
query = "SELECT average(aws.ecs.CPUUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
24-
}
25+
nrql_query = "SELECT average(aws.ecs.CPUUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
2526

26-
critical {
27-
operator = "above"
28-
threshold = var.cpu_threshold
29-
threshold_duration = var.critical_threshold_duration
30-
threshold_occurrences = "all"
31-
}
32-
fill_option = "none"
27+
critical_threshold = var.cpu_threshold
28+
critical_threshold_duration = var.critical_threshold_duration
3329
aggregation_window = var.aggregation_window
3430
aggregation_method = "event_flow"
3531
aggregation_delay = 120
36-
37-
open_violation_on_expiration = false
38-
close_violations_on_expiration = false
32+
tags = var.tags
3933
}
4034

41-
resource "newrelic_nrql_alert_condition" "memory" {
35+
module "memory" {
36+
source = "../nrql-alert"
37+
4238
account_id = var.account_id
4339
policy_id = var.alert_policy_id
44-
type = "static"
45-
name = "${var.name_prefix} - Memory"
46-
enabled = true
47-
violation_time_limit_seconds = 259200
48-
49-
nrql {
50-
query = "SELECT average(aws.ecs.MemoryUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
51-
}
52-
53-
critical {
54-
operator = "above"
55-
threshold = var.memory_threshold
56-
threshold_duration = var.critical_threshold_duration
57-
threshold_occurrences = "all"
58-
}
59-
fill_option = "none"
40+
name = format(
41+
"%s - Memory usage over %s%% for at least %d seconds",
42+
var.name_prefix,
43+
replace(format("%f", var.memory_threshold), "/\\.0+$/", ""),
44+
var.critical_threshold_duration
45+
)
46+
47+
nrql_query = "SELECT average(aws.ecs.MemoryUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
48+
critical_threshold = var.memory_threshold
49+
critical_threshold_duration = var.critical_threshold_duration
6050
aggregation_window = var.aggregation_window
6151
aggregation_method = "event_flow"
6252
aggregation_delay = 120
63-
64-
open_violation_on_expiration = false
65-
close_violations_on_expiration = false
53+
tags = var.tags
6654
}

newrelic/alert-conditions-ecs-cluster/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,9 @@ variable "memory_threshold" {
4848
description = "Maximum memory percentage allowed before triggering alert."
4949
default = 90
5050
}
51+
52+
variable "tags" {
53+
type = map(any)
54+
description = "Tags to apply to the alert conditions. Tag values can either be a single string or a list of strings."
55+
default = {}
56+
}

0 commit comments

Comments
 (0)