Skip to content

Commit

Permalink
Merge pull request #1206 from cloudflare/rate
Browse files Browse the repository at this point in the history
Warn about wrong rate() usage even without metadata
  • Loading branch information
prymitive authored Nov 28, 2024
2 parents 0fea32c + 00e292a commit aeb0788
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 179 deletions.
3 changes: 3 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
{"status.üp"} == 0
```

- [promql/rate](checks/promql/rate.md) will now report a warning if it detects a `rate(sum(...))`
but doesn't have metadata to confirm if `...` is a counter or not.

## v0.67.3

### Fixed
Expand Down
36 changes: 27 additions & 9 deletions internal/checks/promql_rate.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,12 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
continue
}
if e.Rule.RecordingRule != nil && e.Rule.RecordingRule.Expr.SyntaxError == nil && e.Rule.RecordingRule.Record.Value == s.Name {
for _, sm := range utils.HasOuterSum(e.Rule.RecordingRule.Expr.Query) {
if sv, ok := sm.Expr.(*promParser.VectorSelector); ok {
metadata, err := c.prom.Metadata(ctx, sv.Name)
for _, src := range utils.LabelsSource(e.Rule.RecordingRule.Expr.Value.Value, e.Rule.RecordingRule.Expr.Query.Expr) {
if src.Type != utils.AggregateSource {
continue
}
for _, vs := range src.Selectors {
metadata, err := c.prom.Metadata(ctx, vs.Name)
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
Expand All @@ -155,15 +158,30 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
})
continue
}
canReport := true
severity := Warning
for _, m := range metadata.Metadata {
if m.Type == v1.MetricTypeCounter {
problems = append(problems, exprProblem{
text: fmt.Sprintf("`rate(sum(counter))` chain detected, `%s` is called here on results of `%s`, calling `rate()` on `sum()` results will return bogus results, always `sum(rate(counter))`, never `rate(sum(counter))`.",
node.Expr, sm),
severity: Bug,
})
// nolint:exhaustive
switch m.Type {
case v1.MetricTypeCounter:
severity = Bug
default:
canReport = false
}
}
if !canReport {
continue
}
problems = append(problems, exprProblem{
text: fmt.Sprintf("`rate(%s(counter))` chain detected, `%s` is called here on results of `%s(%s)`.",
src.Operation, node.Expr, src.Operation, vs),
details: fmt.Sprintf(
"You can only calculate `rate()` directly from a counter metric. "+
"Calling `rate()` on `%s()` results will return bogus results because `%s()` will hide information on when each counter resets. "+
"You must first calculate `rate()` before calling any aggregation function. Always `sum(rate(counter))`, never `rate(sum(counter))`",
src.Operation, src.Operation),
severity: severity,
})
}
}
}
Expand Down
56 changes: 55 additions & 1 deletion internal/checks/promql_rate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ func notCounterText(name, uri, fun, metric, kind string) string {
}

func rateSumText(rateName, sumExpr string) string {
return fmt.Sprintf("`rate(sum(counter))` chain detected, `rate(%s)` is called here on results of `%s`, calling `rate()` on `sum()` results will return bogus results, always `sum(rate(counter))`, never `rate(sum(counter))`.", rateName, sumExpr)
return fmt.Sprintf("`rate(sum(counter))` chain detected, `rate(%s)` is called here on results of `%s`.", rateName, sumExpr)
}

func rateSumDetails() string {
return "You can only calculate `rate()` directly from a counter metric. Calling `rate()` on `sum()` results will return bogus results because `sum()` will hide information on when each counter resets. You must first calculate `rate()` before calling any aggregation function. Always `sum(rate(counter))`, never `rate(sum(counter))`"
}

func TestRateCheck(t *testing.T) {
Expand Down Expand Up @@ -678,6 +682,7 @@ func TestRateCheck(t *testing.T) {
},
Reporter: "promql/rate",
Text: rateSumText("my:sum[5m]", "sum(foo)"),
Details: rateSumDetails(),
Severity: checks.Bug,
},
}
Expand Down Expand Up @@ -839,6 +844,55 @@ func TestRateCheck(t *testing.T) {
},
},
},
{
description: "sum(rate(sum)) / sum(rate(sum))",
content: `
- alert: Plexi_Worker_High_Signing_Latency
expr: |
sum(
rate(global:response_time_sum{namespace!~"test[.].+"}[15m])
) by (environment, namespace)
/
sum(
rate(global:response_time_count{namespace!~"test[.].+"}[15m])
) by (environment, namespace)
> 3000
`,
checker: newRateCheck,
prometheus: newSimpleProm,
entries: mustParseContent(`
- record: global:response_time_sum
expr: sum(response_time_sum:rate2m)
- record: response_time_sum:rate2m
expr: rate(response_time_sum[2m])
`),
problems: func(_ string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 3,
Last: 11,
},
Reporter: "promql/rate",
Text: rateSumText(`global:response_time_sum{namespace!~"test[.].+"}[15m]`, "sum(response_time_sum:rate2m)"),
Details: rateSumDetails(),
Severity: checks.Warning,
},
}
},
mocks: []*prometheusMock{
{
conds: []requestCondition{requireConfigPath},
resp: configResponse{yaml: "global:\n scrape_interval: 53s\n"},
},
{
conds: []requestCondition{
requireMetadataPath,
},
resp: metadataResponse{metadata: map[string][]v1.Metadata{}},
},
},
},
}
runTests(t, testCases)
}
72 changes: 0 additions & 72 deletions internal/parser/utils/sum.go

This file was deleted.

97 changes: 0 additions & 97 deletions internal/parser/utils/sum_test.go

This file was deleted.

0 comments on commit aeb0788

Please sign in to comment.