Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 29 additions & 40 deletions newrelic/alert-conditions-cloudfront/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,58 +11,47 @@ locals {

}

resource "newrelic_nrql_alert_condition" "error_rate" {
module "error_rate" {
source = "../nrql-alert"

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - Error Rate"
enabled = true
violation_time_limit_seconds = 259200

nrql {
query = "SELECT average(aws.cloudfront.TotalErrorRate) FROM Metric ${local.filter_subquery} FACET entity.name"
}

critical {
operator = "above"
threshold = var.error_rate_threshold
threshold_duration = var.critical_threshold_duration
threshold_occurrences = "all"
}
fill_option = "none"
name = format(
"%s - Error rate over %s%% for at least %d seconds",
var.name_prefix,
replace(format("%f", var.error_rate_threshold), "/\\.0+$/", ""),
var.critical_threshold_duration
)

nrql_query = "SELECT average(aws.cloudfront.TotalErrorRate) FROM Metric ${local.filter_subquery} FACET entity.name"
critical_threshold = var.error_rate_threshold
critical_threshold_duration = var.critical_threshold_duration
aggregation_window = var.aggregation_window
aggregation_method = "event_flow"
aggregation_delay = 120

open_violation_on_expiration = false
close_violations_on_expiration = false
tags = var.tags
}

resource "newrelic_nrql_alert_condition" "throughput" {
module "throughput" {
source = "../nrql-alert"
count = (var.throughput_enabled ? 1 : 0)

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - Throughput"
enabled = true
violation_time_limit_seconds = 259200

nrql {
query = "SELECT average(aws.cloudfront.Requests) FROM Metric ${local.filter_subquery} FACET entity.name"
}

critical {
operator = "below"
threshold = var.throughput_threshold
threshold_duration = var.critical_threshold_duration
threshold_occurrences = "all"
}
fill_option = "none"
name = format("%s - Less than %d requests per %d seconds for over %d seconds",
var.name_prefix,
var.throughput_threshold,
var.aggregation_window,
var.critical_threshold_duration
)

nrql_query = "SELECT average(aws.cloudfront.Requests) FROM Metric ${local.filter_subquery} FACET entity.name"

critical_operator = "below"
critical_threshold = var.throughput_threshold
critical_threshold_duration = var.critical_threshold_duration
aggregation_window = var.aggregation_window
aggregation_method = "event_flow"
aggregation_delay = 120

open_violation_on_expiration = false
close_violations_on_expiration = false
tags = var.tags
}
6 changes: 6 additions & 0 deletions newrelic/alert-conditions-cloudfront/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,9 @@ variable "throughput_threshold" {
description = "Minimum number of requests per minute before triggering throughput alert."
default = 5
}

# Tags forwarded as `tags` to the ../nrql-alert module calls in main.tf.
# NOTE: `map(any)` makes Terraform unify every value to ONE element type, so a
# single map cannot mix plain strings with lists of strings.
variable "tags" {
  type        = map(any)
  description = "Tags to apply to the alert conditions. Values may be single strings or lists of strings, but every value in the map must use the same form (map(any) requires a single element type)."
  default     = {}
}
145 changes: 60 additions & 85 deletions newrelic/alert-conditions-ec2/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,121 +27,96 @@ locals {
duration = var.critical_threshold_duration == null ? local.default_duration : var.critical_threshold_duration
}

resource "newrelic_nrql_alert_condition" "alert" {

module "cpu" {
source = "../nrql-alert"

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - CPU"
enabled = true
violation_time_limit_seconds = 259200

nrql {
query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET aws.ec2.InstanceId"
}

critical {
operator = "above"
threshold = var.critical_threshold
threshold_duration = local.duration
threshold_occurrences = "all"
}
fill_option = "none"
name = format(
"%s - CPU utilization over %s%% for at least %d seconds",
var.name_prefix,
replace(format("%f", var.critical_threshold), "/\\.0+$/", ""),
local.duration
)

nrql_query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET aws.ec2.InstanceId"
critical_threshold = var.critical_threshold
critical_threshold_duration = local.duration
aggregation_window = local.window
aggregation_method = "event_timer"
aggregation_timer = local.timer
expiration_duration = 600
open_violation_on_expiration = false
close_violations_on_expiration = false
tags = var.tags
}

resource "newrelic_nrql_alert_condition" "loss_of_signal" {
module "loss_of_signal" {
count = (var.alert_loss_of_signal ? 1 : 0)
source = "../nrql-alert"

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - Loss of Signal"
enabled = true
violation_time_limit_seconds = 259200

nrql {
query = "SELECT average(aws.ec2.CPUUtilization) FROM Metric ${local.filter_subquery} FACET tags.Name"
}

critical {
operator = "above"
# This should never actually trigger, since CPUUtilization is a percent.
# We don't care about this condition, we're just using this alert to use
# the "open_violation_on_expiration" parameter to detect signal loss (by
# instance name instead of instance id). Otherwise, every instance refresh
# causes alerts/an "incident" in NR.
threshold = 101
threshold_duration = local.duration
threshold_occurrences = "all"
}
fill_option = "none"
name = format(
"%s - No metrics reported for at least %d seconds",
var.name_prefix,
600
)

nrql_query = "SELECT average(${local.metric_name}) FROM ${local.table_name} ${local.filter_subquery} FACET tags.Name"
# This threshold should never actually be crossed, since the queried metric
# (CPU utilization) is a percentage. We don't care about the threshold itself;
# this condition exists only so that "open_violation_on_expiration" can detect
# signal loss, faceted by instance name instead of instance id. Faceting by
# instance id would cause alerts/an "incident" in NR on every instance refresh.
critical_threshold = 101
critical_threshold_duration = local.duration
aggregation_window = local.window
aggregation_method = "event_timer"
aggregation_timer = local.timer
expiration_duration = 600
open_violation_on_expiration = true
close_violations_on_expiration = false
tags = var.tags
}

resource "newrelic_nrql_alert_condition" "memory" {
module "memory" {
count = (var.use_agent_metrics ? 1 : 0)
source = "../nrql-alert"

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - Memory"
enabled = true
violation_time_limit_seconds = 259200

nrql {
query = "SELECT average(memoryUsedPercent) FROM SystemSample ${local.filter_subquery} FACET aws.ec2.InstanceId"
}

critical {
operator = "above"
threshold = var.critical_threshold
threshold_duration = local.duration
threshold_occurrences = "all"
}
fill_option = "none"
aggregation_window = local.window
name = format(
"%s - Memory usage over %s%% for at least %d seconds",
var.name_prefix,
replace(format("%f", var.critical_threshold), "/\\.0+$/", ""),
local.duration
)

nrql_query = "SELECT average(memoryUsedPercent) FROM SystemSample ${local.filter_subquery} FACET aws.ec2.InstanceId"
critical_threshold = var.critical_threshold
critical_threshold_duration = local.duration
aggregation_method = "event_timer"
aggregation_window = local.window
aggregation_timer = local.timer
expiration_duration = 600
open_violation_on_expiration = false
close_violations_on_expiration = false
tags = var.tags
}

resource "newrelic_nrql_alert_condition" "storage" {
module "storage" {
count = (var.use_agent_metrics ? 1 : 0)
source = "../nrql-alert"

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - Storage"
enabled = true
violation_time_limit_seconds = 259200

nrql {
query = "SELECT average(diskUsedPercent) FROM StorageSample ${local.filter_subquery} FACET `tags.Name`, mountPoint"
}

critical {
operator = "above"
threshold = var.critical_threshold
threshold_duration = local.duration
threshold_occurrences = "all"
}
fill_option = "none"
aggregation_window = local.window
name = format(
"%s - Storage usage over %s%% for at least %d seconds",
var.name_prefix,
replace(format("%f", var.critical_threshold), "/\\.0+$/", ""),
local.duration
)

nrql_query = "SELECT average(diskUsedPercent) FROM StorageSample ${local.filter_subquery} FACET `tags.Name`, mountPoint"
critical_threshold = var.critical_threshold
critical_threshold_duration = local.duration
aggregation_method = "event_timer"
aggregation_window = local.window
aggregation_timer = local.timer
expiration_duration = 600
open_violation_on_expiration = false
close_violations_on_expiration = false
tags = var.tags
}
6 changes: 6 additions & 0 deletions newrelic/alert-conditions-ec2/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,9 @@ variable "use_agent_metrics" {
description = "Build the alerts using the extended metrics generated by the New Relic EC2 agent."
default = false
}

# Tags forwarded as `tags` to the ../nrql-alert module calls in main.tf.
# NOTE: `map(any)` makes Terraform unify every value to ONE element type, so a
# single map cannot mix plain strings with lists of strings.
variable "tags" {
  type        = map(any)
  description = "Tags to apply to the alert conditions. Values may be single strings or lists of strings, but every value in the map must use the same form (map(any) requires a single element type)."
  default     = {}
}
66 changes: 27 additions & 39 deletions newrelic/alert-conditions-ecs-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,59 +8,47 @@ locals {
filter_subqueries_and = join(" AND ", compact([local.aws_accounts_subquery, local.cluster_names_subquery]))

filter_subquery = length(local.filter_subqueries_and) == 0 ? "" : "WHERE (${local.filter_subqueries_and})"

}

resource "newrelic_nrql_alert_condition" "cpu" {
module "cpu" {
source = "../nrql-alert"

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - CPU"
enabled = true
violation_time_limit_seconds = 259200
name = format(
"%s - CPU utilization over %s%% for at least %d seconds",
var.name_prefix,
replace(format("%f", var.cpu_threshold), "/\\.0+$/", ""),
var.critical_threshold_duration
)

nrql {
query = "SELECT average(aws.ecs.CPUUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
}
nrql_query = "SELECT average(aws.ecs.CPUUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"

critical {
operator = "above"
threshold = var.cpu_threshold
threshold_duration = var.critical_threshold_duration
threshold_occurrences = "all"
}
fill_option = "none"
critical_threshold = var.cpu_threshold
critical_threshold_duration = var.critical_threshold_duration
aggregation_window = var.aggregation_window
aggregation_method = "event_flow"
aggregation_delay = 120

open_violation_on_expiration = false
close_violations_on_expiration = false
tags = var.tags
}

resource "newrelic_nrql_alert_condition" "memory" {
module "memory" {
source = "../nrql-alert"

account_id = var.account_id
policy_id = var.alert_policy_id
type = "static"
name = "${var.name_prefix} - Memory"
enabled = true
violation_time_limit_seconds = 259200

nrql {
query = "SELECT average(aws.ecs.MemoryUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
}

critical {
operator = "above"
threshold = var.memory_threshold
threshold_duration = var.critical_threshold_duration
threshold_occurrences = "all"
}
fill_option = "none"
name = format(
"%s - Memory usage over %s%% for at least %d seconds",
var.name_prefix,
replace(format("%f", var.memory_threshold), "/\\.0+$/", ""),
var.critical_threshold_duration
)

nrql_query = "SELECT average(aws.ecs.MemoryUtilization.byCluster) FROM Metric ${local.filter_subquery} FACET aws.ecs.ClusterName"
critical_threshold = var.memory_threshold
critical_threshold_duration = var.critical_threshold_duration
aggregation_window = var.aggregation_window
aggregation_method = "event_flow"
aggregation_delay = 120

open_violation_on_expiration = false
close_violations_on_expiration = false
tags = var.tags
}
6 changes: 6 additions & 0 deletions newrelic/alert-conditions-ecs-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,9 @@ variable "memory_threshold" {
description = "Maximum memory percentage allowed before triggering alert."
default = 90
}

# Tags forwarded as `tags` to the ../nrql-alert module calls in main.tf.
# NOTE: `map(any)` makes Terraform unify every value to ONE element type, so a
# single map cannot mix plain strings with lists of strings.
variable "tags" {
  type        = map(any)
  description = "Tags to apply to the alert conditions. Values may be single strings or lists of strings, but every value in the map must use the same form (map(any) requires a single element type)."
  default     = {}
}
Loading