Skip to content

Commit

Permalink
Monitor More Endpoints (#4730)
Browse files Browse the repository at this point in the history
* Add more audit endpoints to New Relic monitoring.

Restructure endpoints template for more control over endpoint and method

* Homepage
* Audit submissions homepage
* All Excel uploads (not downloads)
* PDF Uploads (not downloads)
* Cross-validation (only the validation itself, triggered by POST)
* Single summary report downloads (not multiple from search)

* Terraform linting is a thing, I guess

* Resolve copy paste error on endpoint method

* Remove redundant '%'

* Break endpoints into their own tabs, group by type

* Rename endpoint tabs

* Add quotes on the tab name
  • Loading branch information
jperson1 authored Feb 28, 2025
1 parent 01a012d commit afe51f5
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 17 deletions.
88 changes: 82 additions & 6 deletions terraform/shared/modules/newrelic/monitoring.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,97 @@ locals {
latency_sla = { critical = 1000, warning = 800 } # Average Latency over a week, in ms
})

endpoint_page = templatefile("${path.module}/endpoints.json.tftpl", {
healthcheck_pages = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
endpoint_config = [
page_name = "Healthcheck"
widgets_config = [
{ name = "UEI Validation"
uri = "/api/sac/ueivalidation"
method = "POST"
transactions_sla = { critical = 1, warning = 5 } # Number of Transactions per hour
success_rate_sla = { critical = 0.975, warning = 0.985 } # Success Rate Percentage
latency_sla = { critical = 1000, warning = 800 } # Average Latency over a week, in ms
},
{ name = "Homepage"
uri = "/"
method = "GET"
transactions_sla = { critical = 100, warning = 150 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 350, warning = 250 } # Average Latency over a week, in ms
},
{ name = "Audit Submissions Homepage"
uri = "/audit/"
method = "GET"
transactions_sla = { critical = 50, warning = 70 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 1500, warning = 1200 } # Average Latency over a week, in ms
},
]
})

file_uploads = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
page_name = "File Uploads"
widgets_config = [
{ name = "Workbook Uploads"
uri = "/audit/excel/%"
method = "POST"
transactions_sla = { critical = 20, warning = 25 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 12000, warning = 10000 } # Average Latency over a week, in ms
},
{ name = "PDF Uploads"
uri = "/audit/upload-report/%"
method = "POST"
transactions_sla = { critical = 10, warning = 20 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 15000, warning = 12000 } # Average Latency over a week, in ms
},
]
})

file_downloads = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
page_name = "File Downloads"
widgets_config = [
{ name = "Single Summary Report Download"
uri = "/dissemination/summary-report/xlsx%"
method = "GET"
transactions_sla = { critical = 70, warning = 80 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 800, warning = 700 } # Average Latency over a week, in ms
},
{ name = "Workbook Downloads"
uri = "/audit/excel/%"
method = "GET"
transactions_sla = { critical = 20, warning = 25 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 12000, warning = 10000 } # Average Latency over a week, in ms
},
]
})

submission_pages = templatefile("${path.module}/widgets.json.tftpl", {
env = var.cf_space_name
new_relic_account_id = var.new_relic_account_id
page_name = "Validation and Submission"
widgets_config = [
{ name = "Cross Validation"
uri = "/audit/cross-validation/%"
method = "POST"
transactions_sla = { critical = 8, warning = 10 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 800, warning = 650 } # Average Latency over a week, in ms
},
{ name = "Submit Audit"
uri = "/audit/submission"
transactions_sla = { critical = 100, warning = 250 } # Number of Transactions per hour
uri = "/audit/submission/%" # '/audit/submission%' was also catching '/audit/submission-progress'
method = "POST"
transactions_sla = { critical = 5, warning = 8 } # Number of Transactions per hour
success_rate_sla = { critical = 0.99, warning = 0.995 } # Success Rate Percentage
latency_sla = { critical = 500, warning = 450 } # Average Latency over a week, in ms
latency_sla = { critical = 1500, warning = 1200 } # Average Latency over a week, in ms
}
]
})
Expand All @@ -31,7 +107,7 @@ locals {
locals {
template_renderer = templatefile("${path.module}/monitoring_dashboard.json.tftpl", {
env = var.cf_space_name
pages = [local.high_level_page, local.endpoint_page]
pages = [local.high_level_page, local.healthcheck_pages, local.file_uploads, local.file_downloads, local.submission_pages]
})
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"name": "Endpoint Details",
"name": "${page_name}",
"description": null,
"widgets": [
%{~ for index, endpoint in endpoint_config ~}
%{~ for index, endpoint in widgets_config ~}
%{if index!=0},%{ endif }
{
"title": "",
Expand Down Expand Up @@ -38,7 +38,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT count(*) AS 'Transactions (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 1 hour ago"
"query": "FROM Transaction SELECT count(*) AS 'Transactions (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 1 hour ago"
}
]
}
Expand Down Expand Up @@ -70,7 +70,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT rate(count(*), 1 hour) AS 'Transactions (per hour)' WHERE appName ='gsa-fac-${env}' AND request.uri LIKE '${endpoint.uri}%' SINCE 7 days ago"
"query": "FROM Transaction SELECT rate(count(*), 1 hour) AS 'Transactions (per hour)' WHERE appName ='gsa-fac-${env}' AND request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}' SINCE 7 days ago"
}
],
"thresholds": [
Expand Down Expand Up @@ -122,7 +122,7 @@
"nrqlQueries": [
{
"accountId": ${new_relic_account_id},
"query": "FROM Transaction SELECT count(*) AS 'Transactions' WHERE appName = 'gsa-fac-${env}' and request.uri LIKE '${endpoint.uri}%' SINCE 14 days ago TIMESERIES"
"query": "FROM Transaction SELECT count(*) AS 'Transactions' WHERE appName = 'gsa-fac-${env}' AND request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}' SINCE 14 days ago TIMESERIES"
}
]
}
Expand All @@ -145,7 +145,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 1 hour ago"
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 1 hour ago"
}
]
}
Expand Down Expand Up @@ -176,7 +176,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT 100 - percentage(count(*), WHERE error is true) AS 'Success Rate' WHERE appName ='gsa-fac-${env}' AND request.uri LIKE '${endpoint.uri}%' SINCE 7 days ago "
"query": "FROM Transaction SELECT 100 - percentage(count(*), WHERE error is true) AS 'Success Rate' WHERE appName ='gsa-fac-${env}' AND request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}' SINCE 7 days ago "
}
],
"thresholds": [
Expand Down Expand Up @@ -228,7 +228,7 @@
"nrqlQueries": [
{
"accountId": ${new_relic_account_id},
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 14 days ago TIMESERIES"
"query": "FROM TransactionError SELECT count(*) AS 'Errors (last hour)' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 14 days ago TIMESERIES"
}
]
}
Expand Down Expand Up @@ -268,7 +268,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 1 hour ago"
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 1 hour ago"
}
],
"platformOptions": {
Expand Down Expand Up @@ -302,7 +302,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 7 days ago "
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 7 days ago "
}
],
"thresholds": [
Expand Down Expand Up @@ -362,7 +362,7 @@
"accountIds": [
${new_relic_account_id}
],
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.uri LIKE '${endpoint.uri}%') SINCE 14 days ago TIMESERIES"
"query": "FROM Transaction SELECT average(convert(duration, 's', 'ms')) AS 'Average', percentile(convert(duration, 's', 'ms'), 50) AS 'p50', percentile(convert(duration, 's', 'ms'), 95) AS 'p95' WHERE (appName = 'gsa-fac-${env}') AND (request.method = '${endpoint.method}' AND request.uri LIKE '${endpoint.uri}') SINCE 14 days ago TIMESERIES"
}
],
"platformOptions": {
Expand Down

0 comments on commit afe51f5

Please sign in to comment.