Skip to content

Commit ccc10c5

Browse files
authored
Refactor slo module. Update monitor-based SLOs to be able to create corresponding monitors (cloudposse#69)
* Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module * Update SLO module
1 parent cccc1a5 commit ccc10c5

File tree

10 files changed

+320
-174
lines changed

10 files changed

+320
-174
lines changed

examples/slo/catalog/synthetic.yaml examples/slo/catalog/metric_slo.yaml

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
synthetics-slo:
1+
metric-slo:
22
name: "(SLO) Synthetic Checks"
33
type: metric
44
query:
@@ -17,9 +17,7 @@ synthetics-slo:
1717
- target: "99"
1818
timeframe: "30d"
1919
warning: "99.5"
20-
groups: []
21-
monitor_ids: []
2220
tags:
23-
managedby: terraform
21+
ManagedBy: terraform
2422
test: true
2523
api_version: null

examples/slo/catalog/monitor_slo.yaml

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
monitor-slo:
2+
name: "(SLO) EC2 Availability"
3+
type: monitor
4+
description: |
5+
Number of EC2 failed status checks.
6+
message: |
7+
({stage} {region}) {instance_id} failed a SLO check
8+
force_delete: true
9+
validate: true
10+
thresholds:
11+
- target: "99.5"
12+
timeframe: "7d"
13+
warning: "99.9"
14+
- target: "99"
15+
timeframe: "30d"
16+
warning: "99.5"
17+
# Either `monitor_ids` or `monitors` should be provided
18+
# `monitor_ids` is a list of externally created monitors to use for this monitor-based SLO
19+
# If the `monitors` map is provided, the monitors will be created by the module and assigned to the SLO
20+
monitor_ids: null
21+
monitors:
22+
ec2-failed-status-check:
23+
name: "(EC2) Status Check"
24+
type: metric alert
25+
query: |
26+
avg(last_10m):avg:aws.ec2.status_check_failed{*} by {instance_id} > 0
27+
message: |
28+
({stage} {region}) {instance_id} failed a status check
29+
escalation_message: ""
30+
tags:
31+
ManagedBy: Terraform
32+
priority: 3
33+
notify_no_data: false
34+
notify_audit: true
35+
require_full_window: true
36+
enable_logs_sample: false
37+
force_delete: true
38+
include_tags: true
39+
locked: false
40+
renotify_interval: 60
41+
timeout_h: 0
42+
evaluation_delay: 60
43+
new_host_delay: 300
44+
new_group_delay: 0
45+
groupby_simple_monitor: false
46+
renotify_occurrences: 0
47+
renotify_statuses: []
48+
validate: true
49+
no_data_timeframe: 10
50+
threshold_windows: {}
51+
thresholds:
52+
critical: 0
53+
tags:
54+
ManagedBy: terraform
55+
test: true
56+
api_version: null

examples/slo/outputs.tf

+14-9
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
1-
output "datadog_metric_slos" {
2-
value = module.datadog_slo.datadog_metric_slos
3-
description = "Map of created Metric Based SLOs"
4-
}
5-
61
output "datadog_monitor_slos" {
72
value = module.datadog_slo.datadog_monitor_slos
8-
description = "Map of created Monitor Based SLOs"
3+
description = "Map of created monitor-based SLOs"
4+
}
5+
6+
output "datadog_monitor_slo_monitors" {
7+
value = module.datadog_slo.datadog_monitor_slo_monitors
8+
description = "Created monitors for the monitor-based SLOs"
9+
}
10+
11+
output "datadog_metric_slos" {
12+
value = module.datadog_slo.datadog_metric_slos
13+
description = "Map of created metric-based SLOs"
914
}
1015

11-
output "datadog_slo_alerts" {
12-
value = module.datadog_slo.datadog_slo_alerts
13-
description = "Map of created SLO Based Alerts"
16+
output "datadog_metric_slo_alerts" {
17+
value = module.datadog_slo.datadog_metric_slo_alerts
18+
description = "Map of created metric-based SLO alerts"
1419
}

modules/slo/README.md

+80-17
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
# Datadog SLO
22

3-
This module is responsible for creating Datadog [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/) and their related alerts.
3+
This module is responsible for creating Datadog [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/) and their related monitors and alerts.
4+
5+
The module can create metric-based SLOs (and the corresponding alerts) and monitor-based SLOs (and the corresponding monitors).
46

57
## Alerts
6-
Datadog Alerts for SLOs are terraformed through the monitor object.
8+
9+
Datadog alerts for SLOs are managed in Terraform as Datadog monitor resources (monitors of type `slo alert`).
710

811
An SLO can have many thresholds set, but a monitor can only have one. To get around this, the module creates one Datadog monitor for each threshold within an SLO.
912

10-
For example
13+
## Usage
14+
15+
Example of a metric-based SLO:
1116

1217
```yaml
13-
synthetics-slo:
18+
metric-slo:
1419
name: "(SLO) Synthetic Checks"
1520
type: metric
1621
query:
@@ -23,23 +28,81 @@ synthetics-slo:
2328
force_delete: true
2429
validate: true
2530
thresholds:
26-
- target: "99.5"
27-
target_display: "99.50"
28-
timeframe: "7d"
29-
warning: "99.9"
30-
warning_display: "99.90"
31-
- target: "99"
32-
target_display: "99.00"
33-
timeframe: "30d"
34-
warning: "99.5"
35-
warning_display: "99.50"
36-
groups: []
37-
monitor_ids: []
31+
- target: "99.5"
32+
timeframe: "7d"
33+
warning: "99.9"
34+
- target: "99"
35+
timeframe: "30d"
36+
warning: "99.5"
3837
tags:
39-
managedby: terraform
38+
ManagedBy: terraform
39+
test: true
40+
api_version: null
41+
```
4042
43+
Example of a monitor-based SLO:
44+
45+
```yaml
46+
monitor-slo:
47+
name: "(SLO) EC2 Availability"
48+
type: monitor
49+
description: |
50+
Number of EC2 failed status checks.
51+
message: |
52+
({stage} {region}) {instance_id} failed a SLO check
53+
force_delete: true
54+
validate: true
55+
thresholds:
56+
- target: "99.5"
57+
timeframe: "7d"
58+
warning: "99.9"
59+
- target: "99"
60+
timeframe: "30d"
61+
warning: "99.5"
62+
# Either `monitor_ids` or `monitors` should be provided
63+
# `monitor_ids` is a list of externally created monitors to use for this monitor-based SLO
64+
# If the `monitors` map is provided, the monitors will be created by the module and assigned to the SLO
65+
monitor_ids: null
66+
monitors:
67+
ec2-failed-status-check:
68+
name: "(EC2) Status Check"
69+
type: metric alert
70+
query: |
71+
avg(last_10m):avg:aws.ec2.status_check_failed{*} by {instance_id} > 0
72+
message: |
73+
({stage} {region}) {instance_id} failed a status check
74+
escalation_message: ""
75+
tags:
76+
ManagedBy: Terraform
77+
priority: 3
78+
notify_no_data: false
79+
notify_audit: true
80+
require_full_window: true
81+
enable_logs_sample: false
82+
force_delete: true
83+
include_tags: true
84+
locked: false
85+
renotify_interval: 60
86+
timeout_h: 0
87+
evaluation_delay: 60
88+
new_host_delay: 300
89+
new_group_delay: 0
90+
groupby_simple_monitor: false
91+
renotify_occurrences: 0
92+
renotify_statuses: []
93+
validate: true
94+
no_data_timeframe: 10
95+
threshold_windows: {}
96+
thresholds:
97+
critical: 0
98+
tags:
99+
ManagedBy: terraform
100+
test: true
101+
api_version: null
41102
```
42103
43104
## References
44105
- [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/)
106+
- [Monitor-based SLOs](https://docs.datadoghq.com/monitors/service_level_objectives/monitor/)
45107
- [Datadog Error Budget](https://docs.datadoghq.com/monitors/service_level_objectives/error_budget/)
108+
- [Monitor-based SLO example](https://github.com/DataDog/terraform-provider-datadog/issues/667)

modules/slo/main.tf

-119
Original file line numberDiff line numberDiff line change
@@ -1,124 +1,5 @@
11
locals {
22
enabled = module.this.enabled
33

4-
datadog_monitor_slos = { for slo in var.datadog_slos : slo.name => slo if slo.type == "monitor" && lookup(slo, "enabled", true) && local.enabled }
5-
datadog_metric_slos = { for slo in var.datadog_slos : slo.name => slo if slo.type == "metric" && lookup(slo, "enabled", true) && local.enabled }
6-
7-
temp_datadog_slo_metric_monitors = flatten([
8-
for name, slo in var.datadog_slos : [
9-
for i, threshold in slo.thresholds : {
10-
slo = slo,
11-
slo_name = format("%s_threshold%s", name, i)
12-
threshold = threshold
13-
}
14-
if slo.type == "metric" && local.enabled && lookup(slo, "enabled", true)
15-
]
16-
])
17-
18-
datadog_slo_metric_monitors = { for monitor in local.temp_datadog_slo_metric_monitors : monitor.slo_name => monitor }
19-
204
alert_tags = local.enabled && var.alert_tags != null ? format("%s%s", var.alert_tags_separator, join(var.alert_tags_separator, var.alert_tags)) : ""
215
}
22-
23-
resource "datadog_service_level_objective" "monitor_slo" {
24-
for_each = local.datadog_monitor_slos
25-
26-
# Required
27-
name = each.value.name
28-
type = each.value.type
29-
30-
dynamic "thresholds" {
31-
for_each = each.value.thresholds
32-
content {
33-
target = lookup(thresholds, "target", "99.00")
34-
timeframe = lookup(thresholds, "timeframe", "7d")
35-
36-
target_display = lookup(thresholds, "target_display", "98.00")
37-
warning = lookup(thresholds, "warning", "99.95")
38-
warning_display = lookup(thresholds, "warning_display", "98.00")
39-
}
40-
}
41-
42-
groups = lookup(each.value, "groups", [])
43-
monitor_ids = each.value.monitor_ids
44-
45-
# Optional
46-
description = lookup(each.value, "description", null)
47-
force_delete = lookup(each.value, "force_delete", true)
48-
validate = lookup(each.value, "validate", false)
49-
50-
# Convert terraform tags map to Datadog tags map
51-
# If a key is supplied with a value, it will render "key:value" as a tag
52-
# tags:
53-
# key: value
54-
# If a key is supplied without a value (null), it will render "key" as a tag
55-
# tags:
56-
# key: null
57-
tags = [
58-
for tagk, tagv in lookup(each.value, "tags", module.this.tags) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
59-
]
60-
}
61-
62-
resource "datadog_service_level_objective" "metric_slo" {
63-
for_each = local.datadog_metric_slos
64-
65-
# Required
66-
name = each.value.name
67-
type = each.value.type
68-
69-
query {
70-
denominator = each.value.query.denominator
71-
numerator = each.value.query.numerator
72-
}
73-
74-
# Optional
75-
description = lookup(each.value, "description", null)
76-
force_delete = lookup(each.value, "force_delete", true)
77-
validate = lookup(each.value, "validate", false)
78-
79-
dynamic "thresholds" {
80-
for_each = each.value.thresholds
81-
content {
82-
target = lookup(thresholds.value, "target", null)
83-
timeframe = lookup(thresholds.value, "timeframe", null)
84-
warning = lookup(thresholds.value, "warning", null)
85-
}
86-
}
87-
88-
# Convert terraform tags map to Datadog tags map
89-
# If a key is supplied with a value, it will render "key:value" as a tag
90-
# tags:
91-
# key: value
92-
# If a key is supplied without a value (null), it will render "key" as a tag
93-
# tags:
94-
# key: null
95-
tags = [
96-
for tagk, tagv in lookup(each.value, "tags", module.this.tags) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
97-
]
98-
}
99-
100-
resource "datadog_monitor" "metric_slo_alert" {
101-
for_each = local.datadog_slo_metric_monitors
102-
103-
name = format("(SLO Error Budget Alert) %s", each.value.slo.name)
104-
type = "slo alert"
105-
message = format("%s%s", each.value.slo.message, local.alert_tags)
106-
107-
query = <<EOF
108-
error_budget("${datadog_service_level_objective.metric_slo[each.value.slo.name].id}").over("${each.value.threshold.timeframe}") > ${lookup(each.value.threshold, "target", "99.00")}
109-
EOF
110-
monitor_thresholds {
111-
critical = lookup(each.value.threshold, "target", null)
112-
}
113-
114-
# Convert terraform tags map to Datadog tags map
115-
# If a key is supplied with a value, it will render "key:value" as a tag
116-
# tags:
117-
# key: value
118-
# If a key is supplied without a value (null), it will render "key" as a tag
119-
# tags:
120-
# key: null
121-
tags = [
122-
for tagk, tagv in lookup(each.value.slo, "tags", module.this.tags) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
123-
]
124-
}

0 commit comments

Comments
 (0)