Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Warn about wrong rate() usage even without metadata #1206

Merged
merged 1 commit into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
{"status.üp"} == 0
```

- [promql/rate](checks/promql/rate.md) will now report a warning if it detects a `rate(sum(...))`
but doesn't have metadata to confirm if `...` is a counter or not.

## v0.67.3

### Fixed
Expand Down
36 changes: 27 additions & 9 deletions internal/checks/promql_rate.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,12 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
continue
}
if e.Rule.RecordingRule != nil && e.Rule.RecordingRule.Expr.SyntaxError == nil && e.Rule.RecordingRule.Record.Value == s.Name {
for _, sm := range utils.HasOuterSum(e.Rule.RecordingRule.Expr.Query) {
if sv, ok := sm.Expr.(*promParser.VectorSelector); ok {
metadata, err := c.prom.Metadata(ctx, sv.Name)
for _, src := range utils.LabelsSource(e.Rule.RecordingRule.Expr.Value.Value, e.Rule.RecordingRule.Expr.Query.Expr) {
if src.Type != utils.AggregateSource {
continue
}
for _, vs := range src.Selectors {
metadata, err := c.prom.Metadata(ctx, vs.Name)
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
Expand All @@ -155,15 +158,30 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
})
continue
}
canReport := true
severity := Warning
for _, m := range metadata.Metadata {
if m.Type == v1.MetricTypeCounter {
problems = append(problems, exprProblem{
text: fmt.Sprintf("`rate(sum(counter))` chain detected, `%s` is called here on results of `%s`, calling `rate()` on `sum()` results will return bogus results, always `sum(rate(counter))`, never `rate(sum(counter))`.",
node.Expr, sm),
severity: Bug,
})
// nolint:exhaustive
switch m.Type {
case v1.MetricTypeCounter:
severity = Bug
default:
canReport = false
}
}
if !canReport {
continue
}
problems = append(problems, exprProblem{
text: fmt.Sprintf("`rate(%s(counter))` chain detected, `%s` is called here on results of `%s(%s)`.",
src.Operation, node.Expr, src.Operation, vs),
details: fmt.Sprintf(
"You can only calculate `rate()` directly from a counter metric. "+
"Calling `rate()` on `%s()` results will return bogus results because `%s()` will hide information on when each counter resets. "+
"You must first calculate `rate()` before calling any aggregation function. Always `sum(rate(counter))`, never `rate(sum(counter))`",
src.Operation, src.Operation),
severity: severity,
})
}
}
}
Expand Down
56 changes: 55 additions & 1 deletion internal/checks/promql_rate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ func notCounterText(name, uri, fun, metric, kind string) string {
}

func rateSumText(rateName, sumExpr string) string {
return fmt.Sprintf("`rate(sum(counter))` chain detected, `rate(%s)` is called here on results of `%s`, calling `rate()` on `sum()` results will return bogus results, always `sum(rate(counter))`, never `rate(sum(counter))`.", rateName, sumExpr)
return fmt.Sprintf("`rate(sum(counter))` chain detected, `rate(%s)` is called here on results of `%s`.", rateName, sumExpr)
}

func rateSumDetails() string {
return "You can only calculate `rate()` directly from a counter metric. Calling `rate()` on `sum()` results will return bogus results because `sum()` will hide information on when each counter resets. You must first calculate `rate()` before calling any aggregation function. Always `sum(rate(counter))`, never `rate(sum(counter))`"
}

func TestRateCheck(t *testing.T) {
Expand Down Expand Up @@ -678,6 +682,7 @@ func TestRateCheck(t *testing.T) {
},
Reporter: "promql/rate",
Text: rateSumText("my:sum[5m]", "sum(foo)"),
Details: rateSumDetails(),
Severity: checks.Bug,
},
}
Expand Down Expand Up @@ -839,6 +844,55 @@ func TestRateCheck(t *testing.T) {
},
},
},
{
description: "sum(rate(sum)) / sum(rate(sum))",
content: `
- alert: Plexi_Worker_High_Signing_Latency
expr: |
sum(
rate(global:response_time_sum{namespace!~"test[.].+"}[15m])
) by (environment, namespace)
/
sum(
rate(global:response_time_count{namespace!~"test[.].+"}[15m])
) by (environment, namespace)
> 3000
`,
checker: newRateCheck,
prometheus: newSimpleProm,
entries: mustParseContent(`
- record: global:response_time_sum
expr: sum(response_time_sum:rate2m)
- record: response_time_sum:rate2m
expr: rate(response_time_sum[2m])
`),
problems: func(_ string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 3,
Last: 11,
},
Reporter: "promql/rate",
Text: rateSumText(`global:response_time_sum{namespace!~"test[.].+"}[15m]`, "sum(response_time_sum:rate2m)"),
Details: rateSumDetails(),
Severity: checks.Warning,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{requireConfigPath},
resp: configResponse{yaml: "global:\n scrape_interval: 53s\n"},
},
{
conds: []requestCondition{
requireMetadataPath,
},
resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
},
},
},
}
runTests(t, testCases)
}
72 changes: 0 additions & 72 deletions internal/parser/utils/sum.go

This file was deleted.

97 changes: 0 additions & 97 deletions internal/parser/utils/sum_test.go

This file was deleted.