massgov · cs-ma · Dec 4, 2023 · Dec 4, 2023 · Dec 4, 2023
@@ -11,58 +11,47 @@ locals {
 
 }
 
-resource "newrelic_nrql_alert_condition" "error_rate" {
+module "error_rate" { # CloudFront error-rate condition; delegates to the ../nrql-alert module
+  source = "../nrql-alert"
+
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - Error Rate"
-  enabled = true
-  violation_time_limit_seconds = 259200
-
-  nrql {
-    query = "SELECT average(aws.cloudfront.TotalErrorRate) FROM Metric ${local.filter_subquery} FACET entity.name"
-  }
-
-  critical {
-    operator = "above"
-    threshold = var.error_rate_threshold
-    threshold_duration = var.critical_threshold_duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
+  name = format(
+    "%s - Error rate over %s%% for at least %d seconds",
+    var.name_prefix,
+    replace(format("%f", var.error_rate_threshold), "/\\.?0+$/", ""), # trim trailing zeros: 5.000000 -> 5, 0.500000 -> 0.5
+    var.critical_threshold_duration
+  )
+
+  nrql_query = "SELECT average(aws.cloudfront.TotalErrorRate) FROM Metric ${local.filter_subquery} FACET entity.name"
+  critical_threshold = var.error_rate_threshold
+  critical_threshold_duration = var.critical_threshold_duration
   aggregation_window = var.aggregation_window
   aggregation_method = "event_flow"
   aggregation_delay = 120
-
-  open_violation_on_expiration = false
-  close_violations_on_expiration = false
+  tags = var.tags
 }
 
-resource "newrelic_nrql_alert_condition" "throughput" {
+module "throughput" { # CloudFront request-volume condition; delegates to the ../nrql-alert module
+  source = "../nrql-alert"
   count = (var.throughput_enabled ? 1 : 0)
 
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - Throughput"
-  enabled = true
-  violation_time_limit_seconds = 259200
-
-  nrql {
-    query = "SELECT average(aws.cloudfront.Requests) FROM Metric ${local.filter_subquery} FACET entity.name"
-  }
-
-  critical {
-    operator = "below"
-    threshold = var.throughput_threshold
-    threshold_duration = var.critical_threshold_duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
+  name = format("%s - Less than %d requests per %d seconds for over %d seconds",
+    var.name_prefix,
+    var.throughput_threshold,
+    var.aggregation_window, # fills the "per %d seconds" slot of the name
+    var.critical_threshold_duration
+  )
+
+  nrql_query = "SELECT average(aws.cloudfront.Requests) FROM Metric ${local.filter_subquery} FACET entity.name"
+
+  critical_operator = "below" # carries over operator = "below" from the replaced critical block
+  critical_threshold = var.throughput_threshold
+  critical_threshold_duration = var.critical_threshold_duration
   aggregation_window = var.aggregation_window
   aggregation_method = "event_flow"
   aggregation_delay = 120
-
-  open_violation_on_expiration = false
-  close_violations_on_expiration = false
+  tags = var.tags
 }
@@ -54,3 +54,9 @@ variable "throughput_threshold" {
   description = "Minimum number of requests per minute before triggering throughput alert."
   default     = 5
 }
+
+variable "tags" { # passed unchanged to each alert module as `tags = var.tags`
+  type        = map(any) # NOTE(review): map(any) makes all values in one map share a single type, so strings and lists cannot be mixed in the same map — confirm against the description
+  description = "Tags to apply to the alert conditions. Tag values can either be a single string or a list of strings."
+  default = {}
+}
@@ -27,121 +27,96 @@ locals {
   duration = var.critical_threshold_duration == null ? local.default_duration : var.critical_threshold_duration
 }
 
-resource "newrelic_nrql_alert_condition" "alert" {
+
+module "cpu" { # per-instance CPU condition (FACET aws.ec2.InstanceId); delegates to the ../nrql-alert module
+  source = "../nrql-alert"
+
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - CPU"
-  enabled = true
-  violation_time_limit_seconds = 259200
-
-  nrql {
-    query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET aws.ec2.InstanceId"
-  }
-
-  critical {
-    operator = "above"
-    threshold = var.critical_threshold
-    threshold_duration = local.duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
+  name = format(
+    "%s - CPU utilization over %s%% for at least %d seconds",
+    var.name_prefix,
+    replace(format("%f", var.critical_threshold), "/\\.?0+$/", ""), # trim trailing zeros: 90.000000 -> 90, 87.500000 -> 87.5
+    local.duration
+  )
+
+  nrql_query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET aws.ec2.InstanceId"
+  critical_threshold = var.critical_threshold
+  critical_threshold_duration = local.duration
   aggregation_window = local.window
   aggregation_method = "event_timer"
   aggregation_timer = local.timer
-  expiration_duration = 600
-  open_violation_on_expiration = false
-  close_violations_on_expiration = false
+  tags = var.tags
 }
 
-resource "newrelic_nrql_alert_condition" "loss_of_signal" {
+module "loss_of_signal" { # signal-loss condition faceted by tags.Name; delegates to the ../nrql-alert module
   count = (var.alert_loss_of_signal ? 1 : 0)
+  source = "../nrql-alert"
 
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - Loss of Signal"
-  enabled = true
-  violation_time_limit_seconds = 259200
-
-  nrql {
-    query = "SELECT average(aws.ec2.CPUUtilization) FROM Metric ${local.filter_subquery} FACET tags.Name"
-  }
-
-  critical {
-    operator = "above"
-    # This should never actually trigger, since CPUUtilization is a percent.
-    # We don't care about this condition, we're just using this alert to use
-    # the "open_violation_on_expiration" parameter to detect signal loss (by
-    # instance name instead of instance id). Otherwise, every instance refresh
-    # causes alerts/an "incident" in NR.
-    threshold = 101
-    threshold_duration = local.duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
+  name = format(
+    "%s - No metrics reported for at least %d seconds",
+    var.name_prefix,
+    600 # duplicates the expiration_duration value set further down; keep the two in sync
+  )
+
+  nrql_query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET tags.Name"
+  # This should never actually trigger, since CPUUtilization is a percent.
+  # We don't care about this condition, we're just using this alert to use
+  # the "open_violation_on_expiration" parameter to detect signal loss (by
+  # instance name instead of instance id). Otherwise, every instance refresh
+  # causes alerts/an "incident" in NR.
+  critical_threshold = 101
+  critical_threshold_duration = local.duration
   aggregation_window = local.window
   aggregation_method = "event_timer"
   aggregation_timer = local.timer
   expiration_duration = 600
   open_violation_on_expiration = true
-  close_violations_on_expiration = false
+  tags = var.tags
 }
 
-resource "newrelic_nrql_alert_condition" "memory" {
+module "memory" { # per-instance memory condition from SystemSample; delegates to the ../nrql-alert module
   count = (var.use_agent_metrics ? 1 : 0)
+  source = "../nrql-alert"
 
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - Memory"
-  enabled = true
-  violation_time_limit_seconds = 259200
-
-  nrql {
-    query = "SELECT average(memoryUsedPercent) FROM SystemSample ${local.filter_subquery} FACET aws.ec2.InstanceId"
-  }
-
-  critical {
-    operator = "above"
-    threshold = var.critical_threshold
-    threshold_duration = local.duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
-  aggregation_window = local.window
+  name = format(
+    "%s - Memory usage over %s%% for at least %d seconds",
+    var.name_prefix,
+    replace(format("%f", var.critical_threshold), "/\\.?0+$/", ""), # trim trailing zeros: 90.000000 -> 90, 87.500000 -> 87.5
+    local.duration
+  )
+
+  nrql_query = "SELECT average(memoryUsedPercent) FROM SystemSample ${local.filter_subquery} FACET aws.ec2.InstanceId"
+  critical_threshold = var.critical_threshold
+  critical_threshold_duration = local.duration
   aggregation_method = "event_timer"
+  aggregation_window = local.window
   aggregation_timer = local.timer
-  expiration_duration = 600
-  open_violation_on_expiration = false
-  close_violations_on_expiration = false
+  tags = var.tags
 }
 
-resource "newrelic_nrql_alert_condition" "storage" {
+module "storage" { # disk-usage condition from StorageSample, per tags.Name and mountPoint; delegates to the ../nrql-alert module
   count = (var.use_agent_metrics ? 1 : 0)
+  source = "../nrql-alert"
 
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - Storage"
-  enabled = true
-  violation_time_limit_seconds = 259200
-
-  nrql {
-    query = "SELECT average(diskUsedPercent) FROM StorageSample ${local.filter_subquery} FACET `tags.Name`, mountPoint"
-  }
-
-  critical {
-    operator = "above"
-    threshold = var.critical_threshold
-    threshold_duration = local.duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
-  aggregation_window = local.window
+  name = format(
+    "%s - Storage usage over %s%% for at least %d seconds",
+    var.name_prefix,
+    replace(format("%f", var.critical_threshold), "/\\.?0+$/", ""), # trim trailing zeros: 90.000000 -> 90, 87.500000 -> 87.5
+    local.duration
+  )
+
+  nrql_query = "SELECT average(diskUsedPercent) FROM StorageSample ${local.filter_subquery} FACET `tags.Name`, mountPoint"
+  critical_threshold = var.critical_threshold
+  critical_threshold_duration = local.duration
   aggregation_method = "event_timer"
+  aggregation_window = local.window
   aggregation_timer = local.timer
-  expiration_duration = 600
-  open_violation_on_expiration = false
-  close_violations_on_expiration = false
+  tags = var.tags
 }
@@ -60,3 +60,9 @@ variable "use_agent_metrics" {
   description = "Build the alerts using the extended metrics generated by the New Relic EC2 agent."
   default     = false
 }
+
+variable "tags" { # passed unchanged to each alert module as `tags = var.tags`
+  type        = map(any) # NOTE(review): map(any) makes all values in one map share a single type, so strings and lists cannot be mixed in the same map — confirm against the description
+  description = "Tags to apply to the alert conditions. Tag values can either be a single string or a list of strings."
+  default = {}
+}
@@ -8,59 +8,47 @@ locals {
   filter_subqueries_and = join(" AND ", compact([local.aws_accounts_subquery, local.cluster_names_subquery]))
 
   filter_subquery = length(local.filter_subqueries_and) == 0 ? "" : "WHERE (${local.filter_subqueries_and})"
-
 }
 
-resource "newrelic_nrql_alert_condition" "cpu" {
+module "cpu" { # per-cluster ECS CPU condition (FACET aws.ecs.ClusterName); delegates to the ../nrql-alert module
+  source = "../nrql-alert"
+
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - CPU"
-  enabled = true
-  violation_time_limit_seconds = 259200
+  name = format(
+    "%s - CPU utilization over %s%% for at least %d seconds",
+    var.name_prefix,
+    replace(format("%f", var.cpu_threshold), "/\\.?0+$/", ""), # trim trailing zeros: 90.000000 -> 90, 87.500000 -> 87.5
+    var.critical_threshold_duration
+  )
 
-  nrql {
-    query = "SELECT average(aws.ecs.CPUUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
-  }
+  nrql_query = "SELECT average(aws.ecs.CPUUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
 
-  critical {
-    operator = "above"
-    threshold = var.cpu_threshold
-    threshold_duration = var.critical_threshold_duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
+  critical_threshold = var.cpu_threshold
+  critical_threshold_duration = var.critical_threshold_duration
   aggregation_window = var.aggregation_window
   aggregation_method = "event_flow"
   aggregation_delay = 120
-
-  open_violation_on_expiration = false
-  close_violations_on_expiration = false
+  tags = var.tags
 }
 
-resource "newrelic_nrql_alert_condition" "memory" {
+module "memory" { # per-cluster ECS memory condition (FACET aws.ecs.ClusterName); delegates to the ../nrql-alert module
+  source = "../nrql-alert"
+
   account_id = var.account_id
   policy_id = var.alert_policy_id
-  type = "static"
-  name = "${var.name_prefix} - Memory"
-  enabled = true
-  violation_time_limit_seconds = 259200
-
-  nrql {
-    query = "SELECT average(aws.ecs.MemoryUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
-  }
-
-  critical {
-    operator = "above"
-    threshold = var.memory_threshold
-    threshold_duration = var.critical_threshold_duration
-    threshold_occurrences = "all"
-  }
-  fill_option = "none"
+  name = format(
+    "%s - Memory usage over %s%% for at least %d seconds",
+    var.name_prefix,
+    replace(format("%f", var.memory_threshold), "/\\.?0+$/", ""), # trim trailing zeros: 90.000000 -> 90, 87.500000 -> 87.5
+    var.critical_threshold_duration
+  )
+
+  nrql_query = "SELECT average(aws.ecs.MemoryUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
+  critical_threshold = var.memory_threshold
+  critical_threshold_duration = var.critical_threshold_duration
   aggregation_window = var.aggregation_window
   aggregation_method = "event_flow"
   aggregation_delay = 120
-
-  open_violation_on_expiration = false
-  close_violations_on_expiration = false
+  tags = var.tags
 }
@@ -48,3 +48,9 @@ variable "memory_threshold" {
   description = "Maximum memory percentage allowed before triggering alert."
   default     = 90
 }
+
+variable "tags" { # passed unchanged to each alert module as `tags = var.tags`
+  type        = map(any) # NOTE(review): map(any) makes all values in one map share a single type, so strings and lists cannot be mixed in the same map — confirm against the description
+  description = "Tags to apply to the alert conditions. Tag values can either be a single string or a list of strings."
+  default = {}
+}