Skip to content

Commit ed84f01

Browse files
authored
Merge pull request #777 from cloudflare/gaps
Fix alerts/count gap detection
2 parents 02875ec + 15a0d03 commit ed84f01

24 files changed

+35
-27
lines changed

cmd/pint/tests/0037_disable_checks.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmp stderr stderr.txt
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
88
level=DEBUG msg="File parsed" path=rules/0001.yml rules=3
9-
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
9+
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=[] include=[] exclude=[]
1010
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1 workers=16
1111
level=DEBUG msg="Generated all Prometheus servers" count=1
1212
level=DEBUG msg="Found alerting rule" path=rules/0001.yml alert=default-for lines=1-3

cmd/pint/tests/0039_prom_selected_path.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmp stderr stderr.txt
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
88
level=DEBUG msg="File parsed" path=rules/0001.yml rules=3
9-
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 tags=[] include=["^invalid/.+$"] exclude=["^invalid/rules/.+$"]
9+
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 uptime=up tags=[] include=["^invalid/.+$"] exclude=["^invalid/rules/.+$"]
1010
level=DEBUG msg="Starting query workers" name=disabled uri=http://127.0.0.1:123 workers=16
1111
level=DEBUG msg="Generated all Prometheus servers" count=1
1212
level=DEBUG msg="Found alerting rule" path=rules/0001.yml alert=first lines=1-3

cmd/pint/tests/0063_lint_offline.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ cmp stderr stderr.txt
55
-- stderr.txt --
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
8-
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 tags=[] include=["^invalid/.+$"] exclude=[]
8+
level=INFO msg="Configured new Prometheus server" name=disabled uris=1 uptime=up tags=[] include=["^invalid/.+$"] exclude=[]
99
-- rules/ok.yml --
1010
- record: sum:foo
1111
expr: sum(foo)

cmd/pint/tests/0080_lint_online.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ cmp stderr stderr.txt
1212
-- stderr.txt --
1313
level=INFO msg="Loading configuration file" path=.pint.hcl
1414
level=INFO msg="Finding all rules to check" paths=["rules"]
15-
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=[] exclude=[]
15+
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=prometheus_ready tags=[] include=[] exclude=[]
1616
level=WARN msg="No results for Prometheus uptime metric, you might have set uptime config option to a missing metric, please check your config" name=prom1 metric=prometheus_ready
1717
level=WARN msg="Using dummy Prometheus uptime metric results with no gaps" name=prom1 metric=prometheus_ready
1818
rules/1.yml:2 Warning: `http_errors_total[2d]` selector is trying to query Prometheus for 2d worth of metrics, but `prom1` Prometheus server at http://127.0.0.1:7080 is configured to only keep 1d of metrics history. (promql/range_query)

cmd/pint/tests/0103_file_disable.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmp stderr stderr.txt
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
88
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
9-
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
9+
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=[] include=[] exclude=[]
1010
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
1111
level=DEBUG msg="Generated all Prometheus servers" count=1
1212
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines=9-10

cmd/pint/tests/0108_rule_duplicate.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ cmp stderr stderr.txt
55
-- stderr.txt --
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
8-
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=[] include=[] exclude=[]
8+
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=[] include=[] exclude=[]
99
level=ERROR msg="Query returned an error" err="failed to query Prometheus config: Get \"http://127.0.0.1:7108/api/v1/status/config\": dial tcp 127.0.0.1:7108: connect: connection refused" uri=http://127.0.0.1:7108 query=/api/v1/status/config
1010
level=ERROR msg="Query returned an error" err="failed to query Prometheus config: Get \"http://127.0.0.1:7108/api/v1/status/config\": dial tcp 127.0.0.1:7108: connect: connection refused" uri=http://127.0.0.1:7108 query=/api/v1/status/config
1111
level=ERROR msg="Query returned an error" err="failed to query Prometheus config: Get \"http://127.0.0.1:7108/api/v1/status/config\": dial tcp 127.0.0.1:7108: connect: connection refused" uri=http://127.0.0.1:7108 query=/api/v1/status/config

cmd/pint/tests/0109_rule_duplicate_multiple_proms_include.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ cmp stderr stderr.txt
55
-- stderr.txt --
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
8-
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=["^rules/0001.yml$"] exclude=[]
9-
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=["^rules/0002.yml$"] exclude=[]
8+
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=up tags=[] include=["^rules/0001.yml$"] exclude=[]
9+
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=["^rules/0002.yml$"] exclude=[]
1010
-- rules/0001.yml --
1111
- record: "colo:duplicate"
1212
expr: sum(foo) without(job)

cmd/pint/tests/0110_rule_duplicate_multiple_proms_exclude.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ cmp stderr stderr.txt
55
-- stderr.txt --
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
8-
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=[] exclude=["^rules/0002.yml$"]
9-
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=[] exclude=["^rules/0001.yml$"]
8+
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=up tags=[] include=[] exclude=["^rules/0002.yml$"]
9+
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=[] exclude=["^rules/0001.yml$"]
1010
-- rules/0001.yml --
1111
- record: "colo:duplicate"
1212
expr: sum(foo) without(job)

cmd/pint/tests/0115_file_disable_tag.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmp stderr stderr.txt
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
88
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
9-
level=INFO msg="Configured new Prometheus server" name=prom uris=1 tags=["foo","bar"] include=[] exclude=[]
9+
level=INFO msg="Configured new Prometheus server" name=prom uris=1 uptime=up tags=["foo","bar"] include=[] exclude=[]
1010
level=DEBUG msg="Starting query workers" name=prom uri=http://127.0.0.1:7103 workers=16
1111
level=DEBUG msg="Generated all Prometheus servers" count=1
1212
level=DEBUG msg="Found recording rule" path=rules/0001.yml record=colo:test1 lines="6 8"

cmd/pint/tests/0144_discovery_filepath.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@ level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.exampl
1919
level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom2.yml
2020
level=DEBUG msg="Extracted regexp variables" regexp=^(?P<name>\w+).ya?ml$ vars={"name":"prom2"}
2121
level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true
22-
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=["^.*$"]
22+
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 uptime=up tags=["name/prom1"] include=[] exclude=["^.*$"]
2323
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16
2424
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16
25-
level=INFO msg="Configured new Prometheus server" name=prom2 uris=2 tags=["name/prom2"] include=[] exclude=["^.*$"]
25+
level=INFO msg="Configured new Prometheus server" name=prom2 uris=2 uptime=up tags=["name/prom2"] include=[] exclude=["^.*$"]
2626
level=DEBUG msg="Starting query workers" name=prom2 uri=https://prom2.example.com workers=16
2727
level=DEBUG msg="Starting query workers" name=prom2 uri=https://prom2-backup.example.com workers=16
2828
level=DEBUG msg="Generated all Prometheus servers" count=2

cmd/pint/tests/0145_discovery_filepath_dup.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmp stderr stderr.txt
66
level=INFO msg="Loading configuration file" path=.pint.hcl
77
level=INFO msg="Finding all rules to check" paths=["rules"]
88
level=DEBUG msg="File parsed" path=rules/0001.yml rules=1
9-
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=[] exclude=[]
9+
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=[] exclude=[]
1010
level=DEBUG msg="Starting query workers" name=prom2 uri=https://unique.example.com workers=16
1111
level=INFO msg="Finding Prometheus servers using file paths" dir=servers match=^(?P<name>\w+).ya?ml$
1212
level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom1.yaml
@@ -21,7 +21,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.exampl
2121
level=DEBUG msg="Path discovery match" match=^(?P<name>\w+).ya?ml$ path=prom2.yml
2222
level=DEBUG msg="Extracted regexp variables" regexp=^(?P<name>\w+).ya?ml$ vars={"name":"prom2"}
2323
level=DEBUG msg="Rendered Prometheus server" name=prom2 uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["name/prom2"] required=true
24-
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 tags=["name/prom1"] include=[] exclude=[]
24+
level=INFO msg="Configured new Prometheus server" name=prom1 uris=2 uptime=up tags=["name/prom1"] include=[] exclude=[]
2525
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1.example.com workers=16
2626
level=DEBUG msg="Starting query workers" name=prom1 uri=https://prom1-backup.example.com workers=16
2727
level=DEBUG msg="Stopping query workers" name=prom2 uri=https://unique.example.com

cmd/pint/tests/0149_discovery_prom.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
1818
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=[] required=false
1919
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7149
2020
level=DEBUG msg="Added new failover URI" name=prom-ha uri=https://prom2.example.com
21-
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"]
21+
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 uptime=up tags=[] include=[] exclude=["^.*$"]
2222
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
2323
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom2.example.com workers=16
2424
level=DEBUG msg="Generated all Prometheus servers" count=1

cmd/pint/tests/0150_discovery_prom_dup_tags.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
1818
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=["X-Host"] timeout=5s concurrency=16 rateLimit=100 uptime=up tags=["prom2"] required=false
1919
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7150
2020
level=WARN msg="Duplicated prometheus server with different tags" name=prom-ha a=["prom2"] b=["prom1"]
21-
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=["prom1"] include=[] exclude=[]
21+
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 uptime=up tags=["prom1"] include=[] exclude=[]
2222
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
2323
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
2424
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"

cmd/pint/tests/0152_discovery_prom_dup_uptime.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ level=DEBUG msg="Parsed response" uri=http://127.0.0.1:7152 query=prometheus_rea
2626
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=prom2 tags=[] required=false
2727
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=5s concurrency=16 rateLimit=100 uptime=prom2 tags=[] required=false
2828
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7152
29-
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 tags=[] include=[] exclude=["^.*$"]
29+
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=2 uptime=prom1 tags=[] include=[] exclude=["^.*$"]
3030
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
3131
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom2.example.com workers=16
3232
level=DEBUG msg="Generated all Prometheus servers" count=1

cmd/pint/tests/0155_discovery_prom_dup_include.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
1818
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=2m0s concurrency=16 rateLimit=100 uptime=up tags=[] required=false
1919
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7155
2020
level=WARN msg="Duplicated prometheus server with different include" name=prom-ha a=["^prom2$"] b=["^prom1$"]
21-
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=["^prom1$"] exclude=[]
21+
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 uptime=up tags=[] include=["^prom1$"] exclude=[]
2222
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
2323
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
2424
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"

cmd/pint/tests/0156_discovery_prom_dup_exclude.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom1.exam
1818
level=DEBUG msg="Rendered Prometheus server" name=prom-ha uri=https://prom2.example.com headers=[] timeout=2m0s concurrency=16 rateLimit=100 uptime=up tags=[] required=false
1919
level=DEBUG msg="Stopping query workers" name=discovery uri=http://127.0.0.1:7156
2020
level=WARN msg="Duplicated prometheus server with different exclude" name=prom-ha a=["^prom2$"] b=["^prom1$"]
21-
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 tags=[] include=[] exclude=["^prom1$"]
21+
level=INFO msg="Configured new Prometheus server" name=prom-ha uris=1 uptime=up tags=[] include=[] exclude=["^prom1$"]
2222
level=DEBUG msg="Starting query workers" name=prom-ha uri=https://prom1.example.com workers=16
2323
level=DEBUG msg="Stopping query workers" name=prom-ha uri=https://prom1.example.com
2424
level=ERROR msg="Fatal error" err="Duplicated name for Prometheus server definition: prom-ha"

cmd/pint/tests/0157_series_other_servers.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ cmp stderr stderr.txt
1515
-- stderr.txt --
1616
level=INFO msg="Loading configuration file" path=.pint.hcl
1717
level=INFO msg="Finding all rules to check" paths=["rules"]
18-
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 tags=[] include=["^rules/1.yml$"] exclude=[]
19-
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 tags=[] include=["^rules/2.yml$"] exclude=[]
18+
level=INFO msg="Configured new Prometheus server" name=prom1 uris=1 uptime=up tags=[] include=["^rules/1.yml$"] exclude=[]
19+
level=INFO msg="Configured new Prometheus server" name=prom2 uris=1 uptime=up tags=[] include=["^rules/2.yml$"] exclude=[]
2020
level=WARN msg="No results for Prometheus uptime metric, you might have set uptime config option to a missing metric, please check your config" name=prom1 metric=up
2121
level=WARN msg="Using dummy Prometheus uptime metric results with no gaps" name=prom1 metric=up
2222
rules/1.yml:5 Bug: `prom1` Prometheus server at http://127.0.0.1:7157 didn't have any series for `only_on_prom2` metric in the last 1w. (promql/series)

docs/changelog.md

+7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Changelog
22

3+
## v0.49.1
4+
5+
### Fixed
6+
7+
- The `alerts/count` check wasn't using the `uptime` field from `prometheus` config blocks
8+
for metric gap detection.
9+
310
## v0.49.0
411

512
### Added

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module github.com/cloudflare/pint
22

3-
go 1.21.4
3+
go 1.21.3
44

55
require (
66
github.com/cespare/xxhash/v2 v2.2.0

internal/checks/alerts_count.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func (c AlertsCheck) Check(ctx context.Context, _ string, rule parser.Rule, _ []
7777
}
7878

7979
if len(qr.Series.Ranges) > 0 {
80-
promUptime, err := c.prom.RangeQuery(ctx, "count(up)", params)
80+
promUptime, err := c.prom.RangeQuery(ctx, fmt.Sprintf("count(%s)", c.prom.UptimeMetric()), params)
8181
if err != nil {
8282
slog.Warn("Cannot detect Prometheus uptime gaps", slog.Any("err", err), slog.String("name", c.prom.Name()))
8383
} else {

internal/config/prometheus.go

+1
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ func (pg *PrometheusGenerator) addServer(server *promapi.FailoverGroup) error {
235235
"Configured new Prometheus server",
236236
slog.String("name", server.Name()),
237237
slog.Int("uris", server.ServerCount()),
238+
slog.String("uptime", server.UptimeMetric()),
238239
slog.Any("tags", server.Tags()),
239240
slog.Any("include", server.Include()),
240241
slog.Any("exclude", server.Exclude()),

tools/gofumpt/go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module _
22

3-
go 1.21.4
3+
go 1.21.3
44

55
require mvdan.cc/gofumpt v0.5.0
66

tools/goimports/go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module _
22

3-
go 1.21.4
3+
go 1.21.3
44

55
require golang.org/x/tools v0.14.0
66

tools/golangci-lint/go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
module _
22

3-
go 1.21.4
3+
go 1.21.3
44

55
require github.com/golangci/golangci-lint v1.55.2
66

0 commit comments

Comments
 (0)