Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitor More Endpoints #4730

Merged
merged 9 commits into from
Feb 28, 2025
Merged
88 changes: 82 additions & 6 deletions terraform/shared/modules/newrelic/monitoring.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,97 @@ locals {
latency_sla = { critical = 1000, warning = 800 } # Average Latency over a week, in ms
})

endpoint_page = templatefile("${path.module}/endpoints.json.tftpl", {
healthcheck_pages = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
endpoint_config = [
page_name = "Healthcheck"
widgets_config = [
{ name = "UEI Validation"
uri = "/api/sac/ueivalidation"
method = "POST"
transactions_sla = { critical = 1, warning = 5 } # Number of Transactions per hour
success_rate_sla = { critical = 0.975, warning = 0.985 } # Success Rate Percentage
latency_sla = { critical = 1000, warning = 800 } # Average Latency over a week, in ms
},
{ name = "Homepage"
uri = "/"
method = "GET"
transactions_sla = { critical = 100, warning = 150 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 350, warning = 250 } # Average Latency over a week, in ms
},
{ name = "Audit Submissions Homepage"
uri = "/audit/"
method = "GET"
transactions_sla = { critical = 50, warning = 70 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 1500, warning = 1200 } # Average Latency over a week, in ms
},
]
})

file_uploads = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
page_name = "File Uploads"
widgets_config = [
{ name = "Workbook Uploads"
uri = "/audit/excel/%"
method = "POST"
transactions_sla = { critical = 20, warning = 25 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 12000, warning = 10000 } # Average Latency over a week, in ms
},
{ name = "PDF Uploads"
uri = "/audit/upload-report/%"
method = "POST"
transactions_sla = { critical = 10, warning = 20 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 15000, warning = 12000 } # Average Latency over a week, in ms
},
]
})

file_downloads = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
page_name = "File Downloads"
widgets_config = [
{ name = "Single Summary Report Download"
uri = "/dissemination/summary-report/xlsx%"
method = "GET"
transactions_sla = { critical = 70, warning = 80 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 800, warning = 700 } # Average Latency over a week, in ms
},
{ name = "Workbook Downloads"
uri = "/audit/excel/%"
method = "GET"
transactions_sla = { critical = 20, warning = 25 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 12000, warning = 10000 } # Average Latency over a week, in ms
},
]
})

submission_pages = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
page_name = "Validation and Submission"
widgets_config = [
{ name = "Cross Validation"
uri = "/audit/cross-validation/%"
method = "POST"
transactions_sla = { critical = 8, warning = 10 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 800, warning = 650 } # Average Latency over a week, in ms
},
{ name = "Submit Audit"
uri = "/audit/submission"
transactions_sla = { critical = 100, warning = 250 } # Number of Transactions per hour
uri = "/audit/submission/%" # Trailing slash is deliberate: '/audit/submission%' also matched '/audit/submission-progress'
method = "POST"
transactions_sla = { critical = 5, warning = 8 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 500, warning = 450 } # Average Latency over a week, in ms
latency_sla = { critical = 1500, warning = 1200 } # Average Latency over a week, in ms
}
]
})
Expand All @@ -31,7 +107,7 @@ locals {
locals {
template_renderer = templatefile("${path.module}/monitoring_dashboard.json.tftpl", {
env = var.cf_space_name
pages = [local.high_level_page, local.endpoint_page]
pages = [local.high_level_page, local.healthcheck_pages, local.file_uploads, local.file_downloads, local.submission_pages]
})
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"name": "Endpoint Details",
"name": "${page_name}",
"description": null,
"widgets": [
%{~ for index, endpoint in endpoint_config ~}
%{~ for index, endpoint in widgets_config ~}
%{if index!=0},%{ endif }
{
"title": "",
Expand Down Expand Up @@ -38,7 +38,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT count(*) AS 'Transactions (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 1 hour ago"
"query": "FROM Transaction SELECT count(*) AS 'Transactions (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 1 hour ago"
}
]
}
Expand Down Expand Up @@ -70,7 +70,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT rate(count(*), 1 hour) AS 'Transactions (per hour)' WHERE appName ='gsa-fac-${env}' AND request.uri LIKE '${endpoint.uri}%' SINCE 7 days ago"
"query": "FROM Transaction SELECT rate(count(*), 1 hour) AS 'Transactions (per hour)' WHERE appName ='gsa-fac-${env}' AND request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}' SINCE 7 days ago"
}
],
"thresholds": [
Expand Down Expand Up @@ -122,7 +122,7 @@
"nrqlQueries": [
{
"accountId": ${new_relic_account_id},
"query": "FROM Transaction SELECT count(*) AS 'Transactions' WHERE appName = 'gsa-fac-${env}' and request.uri LIKE '${endpoint.uri}%' SINCE 14 days ago TIMESERIES"
"query": "FROM Transaction SELECT count(*) AS 'Transactions' WHERE appName = 'gsa-fac-${env}' AND request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}' SINCE 14 days ago TIMESERIES"
}
]
}
Expand All @@ -145,7 +145,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 1 hour ago"
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 1 hour ago"
}
]
}
Expand Down Expand Up @@ -176,7 +176,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT 100 - percentage(count(*), WHERE error is true) AS 'Success Rate' WHERE appName ='gsa-fac-${env}' AND request.uri LIKE '${endpoint.uri}%' SINCE 7 days ago "
"query": "FROM Transaction SELECT 100 - percentage(count(*), WHERE error is true) AS 'Success Rate' WHERE appName ='gsa-fac-${env}' AND request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}' SINCE 7 days ago "
}
],
"thresholds": [
Expand Down Expand Up @@ -228,7 +228,7 @@
"nrqlQueries": [
{
"accountId": ${new_relic_account_id},
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 14 days ago TIMESERIES"
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 14 days ago TIMESERIES"
}
]
}
Expand Down Expand Up @@ -268,7 +268,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 1 hour ago"
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 1 hour ago"
}
],
"platformOptions": {
Expand Down Expand Up @@ -302,7 +302,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 7 days ago "
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 7 days ago "
}
],
"thresholds": [
Expand Down Expand Up @@ -362,7 +362,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 14 days ago TIMESERIES"
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 14 days ago TIMESERIES"
}
],
"platformOptions": {
Expand Down
Loading