Skip to content

Commit 160a26c

Browse files
authored
Merge pull request #1050 from cloudflare/truncate
Don't list all possible prometheus servers
2 parents 9155063 + fe18dfd commit 160a26c

File tree

3 files changed

+152
-8
lines changed

3 files changed

+152
-8
lines changed

internal/checks/base_test.go

+14 -3
Original file line number | Diff line number | Diff line change
@@ -93,12 +93,15 @@ type problemsFn func(string) []checks.Problem
9393

9494
// newPrometheusFn builds the Prometheus failover group under test from a
// string argument (runTests calls it with uri). It may return nil; runTests
// checks for that before starting workers on the result.
type newPrometheusFn func(string) *promapi.FailoverGroup
9595

96-
type newCtxFn func() context.Context
96+
// newCtxFn builds a custom context for a test case. When a test case sets it,
// runTests calls it with uri and uses the returned context in place of the
// default one it constructs.
type newCtxFn func(string) context.Context
97+
98+
// otherPromsFn returns additional Prometheus failover groups for a test case.
// runTests calls it with uri, appends every returned group to the list of
// servers and starts (and later closes) workers for each of them.
type otherPromsFn func(string) []*promapi.FailoverGroup
9799

98100
type checkTest struct {
99101
description string
100102
content string
101103
prometheus newPrometheusFn
104+
otherProms otherPromsFn
102105
ctx newCtxFn
103106
checker newCheckFn
104107
entries []discovery.Entry
@@ -130,20 +133,28 @@ func runTests(t *testing.T, testCases []checkTest) {
130133
}
131134

132135
var proms []*promapi.FailoverGroup
136+
reg := prometheus.NewRegistry()
133137
prom := tc.prometheus(uri)
134138
if prom != nil {
135139
proms = append(proms, prom)
136-
reg := prometheus.NewRegistry()
137140
prom.StartWorkers(reg)
138141
defer prom.Close(reg)
139142
}
140143

144+
if tc.otherProms != nil {
145+
for _, op := range tc.otherProms(uri) {
146+
proms = append(proms, op)
147+
op.StartWorkers(reg)
148+
defer op.Close(reg)
149+
}
150+
}
151+
141152
entries, err := parseContent(tc.content)
142153
require.NoError(t, err, "cannot parse rule content")
143154
for _, entry := range entries {
144155
ctx := context.WithValue(context.Background(), promapi.AllPrometheusServers, proms)
145156
if tc.ctx != nil {
146-
ctx = tc.ctx()
157+
ctx = tc.ctx(uri)
147158
}
148159
problems := tc.checker(prom).Check(ctx, entry.Path, entry.Rule, tc.entries)
149160
require.Equal(t, tc.problems(uri), problems)

internal/checks/promql_series.go

+11 -1
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"log/slog"
77
"regexp"
8+
"strconv"
89
"strings"
910
"time"
1011

@@ -573,7 +574,7 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string
573574
buf.WriteString(query)
574575
buf.WriteString("` was found on other prometheus servers:\n\n")
575576

576-
var matches int
577+
var matches, skipped int
577578
for _, prom := range servers {
578579
slog.Debug("Checking if metric exists on any other Prometheus server", slog.String("check", c.Reporter()), slog.String("selector", query))
579580

@@ -591,6 +592,10 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string
591592

592593
if series > 0 {
593594
matches++
595+
if matches > 10 {
596+
skipped++
597+
continue
598+
}
594599
buf.WriteString("- [")
595600
buf.WriteString(prom.Name())
596601
buf.WriteString("](")
@@ -600,6 +605,11 @@ func (c SeriesCheck) checkOtherServer(ctx context.Context, query string) string
600605
buf.WriteString(")\n")
601606
}
602607
}
608+
if skipped > 0 {
609+
buf.WriteString("- and ")
610+
buf.WriteString(strconv.Itoa(skipped))
611+
buf.WriteString(" other server(s).\n")
612+
}
603613

604614
buf.WriteString("\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n")
605615

internal/checks/promql_series_test.go

+127 -4
Original file line number | Diff line number | Diff line change
@@ -437,7 +437,7 @@ func TestSeriesCheck(t *testing.T) {
437437
{
438438
description: "#2 series never present, custom range",
439439
content: "- record: foo\n expr: sum(notfound)\n",
440-
ctx: func() context.Context {
440+
ctx: func(_ string) context.Context {
441441
s := checks.PromqlSeriesSettings{
442442
LookbackRange: "3d",
443443
LookbackStep: "6m",
@@ -669,7 +669,7 @@ func TestSeriesCheck(t *testing.T) {
669669
description: "#2 series never present but metric ignored",
670670
content: "- record: foo\n expr: sum(notfound)\n",
671671
checker: newSeriesCheck,
672-
ctx: func() context.Context {
672+
ctx: func(_ string) context.Context {
673673
s := checks.PromqlSeriesSettings{
674674
IgnoreMetrics: []string{"foo", "bar", "not.+"},
675675
}
@@ -1413,7 +1413,7 @@ func TestSeriesCheck(t *testing.T) {
14131413
description: "#4 metric was present but disappeared over 1h ago / ignored",
14141414
content: "- record: foo\n expr: sum(found{job=\"foo\", instance=\"bar\"})\n",
14151415
checker: newSeriesCheck,
1416-
ctx: func() context.Context {
1416+
ctx: func(_ string) context.Context {
14171417
s := checks.PromqlSeriesSettings{
14181418
IgnoreMetrics: []string{"foo", "found", "not.+"},
14191419
}
@@ -1990,7 +1990,7 @@ func TestSeriesCheck(t *testing.T) {
19901990
description: "#5 metric was present but not with label value",
19911991
content: "- record: foo\n expr: sum(found{notfound=\"notfound\", instance=~\".+\", not!=\"negative\", instance!~\"bad\"})\n",
19921992
checker: newSeriesCheck,
1993-
ctx: func() context.Context {
1993+
ctx: func(_ string) context.Context {
19941994
s := checks.PromqlSeriesSettings{
19951995
IgnoreMetrics: []string{"foo", "bar", "found"},
19961996
}
@@ -3771,6 +3771,129 @@ func TestSeriesCheck(t *testing.T) {
37713771
}
37723772
},
37733773
},
3774+
{
3775+
description: "series not present on other servers",
3776+
content: "- record: foo\n expr: notfound\n",
3777+
checker: newSeriesCheck,
3778+
prometheus: newSimpleProm,
3779+
otherProms: func(uri string) []*promapi.FailoverGroup {
3780+
var proms []*promapi.FailoverGroup
3781+
for i := range 5 {
3782+
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
3783+
}
3784+
return proms
3785+
},
3786+
problems: func(uri string) []checks.Problem {
3787+
return []checks.Problem{
3788+
{
3789+
Lines: parser.LineRange{
3790+
First: 2,
3791+
Last: 2,
3792+
},
3793+
Reporter: checks.SeriesCheckName,
3794+
Text: noMetricText("prom", uri, "notfound", "1w"),
3795+
Details: checks.SeriesCheckCommonProblemDetails,
3796+
Severity: checks.Bug,
3797+
},
3798+
}
3799+
},
3800+
mocks: []*prometheusMock{
3801+
{
3802+
conds: []requestCondition{requestPathCond{path: "/other/api/v1/query"}},
3803+
resp: respondWithEmptyVector(),
3804+
},
3805+
{
3806+
conds: []requestCondition{requireQueryPath},
3807+
resp: respondWithEmptyVector(),
3808+
},
3809+
{
3810+
conds: []requestCondition{requireRangeQueryPath},
3811+
resp: respondWithEmptyMatrix(),
3812+
},
3813+
},
3814+
},
3815+
{
3816+
description: "series present on other servers",
3817+
content: "- record: foo\n expr: notfound\n",
3818+
checker: newSeriesCheck,
3819+
prometheus: newSimpleProm,
3820+
otherProms: func(uri string) []*promapi.FailoverGroup {
3821+
var proms []*promapi.FailoverGroup
3822+
for i := range 5 {
3823+
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
3824+
}
3825+
return proms
3826+
},
3827+
problems: func(uri string) []checks.Problem {
3828+
return []checks.Problem{
3829+
{
3830+
Lines: parser.LineRange{
3831+
First: 2,
3832+
Last: 2,
3833+
},
3834+
Reporter: checks.SeriesCheckName,
3835+
Text: noMetricText("prom", uri, "notfound", "1w"),
3836+
Details: fmt.Sprintf("`notfound` was found on other prometheus servers:\n\n- [prom0](%s/other/graph?g0.expr=notfound)\n- [prom1](%s/other/graph?g0.expr=notfound)\n- [prom2](%s/other/graph?g0.expr=notfound)\n- [prom3](%s/other/graph?g0.expr=notfound)\n- [prom4](%s/other/graph?g0.expr=notfound)\n\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n", uri, uri, uri, uri, uri),
3837+
Severity: checks.Bug,
3838+
},
3839+
}
3840+
},
3841+
mocks: []*prometheusMock{
3842+
{
3843+
conds: []requestCondition{requestPathCond{path: "/other/api/v1/query"}},
3844+
resp: respondWithSingleInstantVector(),
3845+
},
3846+
{
3847+
conds: []requestCondition{requireQueryPath},
3848+
resp: respondWithEmptyVector(),
3849+
},
3850+
{
3851+
conds: []requestCondition{requireRangeQueryPath},
3852+
resp: respondWithEmptyMatrix(),
3853+
},
3854+
},
3855+
},
3856+
{
3857+
description: "series present on other servers / 15",
3858+
content: "- record: foo\n expr: notfound\n",
3859+
checker: newSeriesCheck,
3860+
prometheus: newSimpleProm,
3861+
otherProms: func(uri string) []*promapi.FailoverGroup {
3862+
var proms []*promapi.FailoverGroup
3863+
for i := range 15 {
3864+
proms = append(proms, simpleProm(fmt.Sprintf("prom%d", i), uri+"/other", time.Second, false))
3865+
}
3866+
return proms
3867+
},
3868+
problems: func(uri string) []checks.Problem {
3869+
return []checks.Problem{
3870+
{
3871+
Lines: parser.LineRange{
3872+
First: 2,
3873+
Last: 2,
3874+
},
3875+
Reporter: checks.SeriesCheckName,
3876+
Text: noMetricText("prom", uri, "notfound", "1w"),
3877+
Details: fmt.Sprintf("`notfound` was found on other prometheus servers:\n\n- [prom0](%s/other/graph?g0.expr=notfound)\n- [prom1](%s/other/graph?g0.expr=notfound)\n- [prom2](%s/other/graph?g0.expr=notfound)\n- [prom3](%s/other/graph?g0.expr=notfound)\n- [prom4](%s/other/graph?g0.expr=notfound)\n- [prom5](%s/other/graph?g0.expr=notfound)\n- [prom6](%s/other/graph?g0.expr=notfound)\n- [prom7](%s/other/graph?g0.expr=notfound)\n- [prom8](%s/other/graph?g0.expr=notfound)\n- [prom9](%s/other/graph?g0.expr=notfound)\n- and 5 other server(s).\n\nYou might be trying to deploy this rule to the wrong Prometheus server instance.\n", uri, uri, uri, uri, uri, uri, uri, uri, uri, uri),
3878+
Severity: checks.Bug,
3879+
},
3880+
}
3881+
},
3882+
mocks: []*prometheusMock{
3883+
{
3884+
conds: []requestCondition{requestPathCond{path: "/other/api/v1/query"}},
3885+
resp: respondWithSingleInstantVector(),
3886+
},
3887+
{
3888+
conds: []requestCondition{requireQueryPath},
3889+
resp: respondWithEmptyVector(),
3890+
},
3891+
{
3892+
conds: []requestCondition{requireRangeQueryPath},
3893+
resp: respondWithEmptyMatrix(),
3894+
},
3895+
},
3896+
},
37743897
}
37753898
runTests(t, testCases)
37763899
}

0 commit comments

Comments
 (0)