Skip to content

Commit

Permalink
Warn about topk, bottomk and other sampling functions
Browse files Browse the repository at this point in the history
Fixes #820.
  • Loading branch information
prymitive committed Oct 22, 2024
1 parent 8856982 commit e45f978
Show file tree
Hide file tree
Showing 11 changed files with 606 additions and 31 deletions.
12 changes: 12 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## v0.68.0

### Added

- [promql/fragile](checks/promql/fragile.md) will now warn when alerting rules are using
one of the aggregation operation that can return different series on every evaluation,
which can cause alert floppiness - [#820](https://github.com/cloudflare/pint/issues/820).

### Fixed

- Don't try to create GitLab comments on unmodified lines - [#1147](https://github.com/cloudflare/pint/pull/1147).

## v0.67.0

### Added
Expand Down
1 change: 0 additions & 1 deletion internal/checks/alerts_template.go
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,6 @@ func (c TemplateCheck) Check(ctx context.Context, _ discovery.Path, rule parser.
func (c TemplateCheck) checkHumanizeIsNeeded(node *parser.PromQLNode) (problems []exprProblem) {
for _, call := range utils.HasOuterRate(node) {
problems = append(problems, exprProblem{
expr: call.String(),
text: fmt.Sprintf("Using the value of `%s` inside this annotation might be hard to read, consider using one of humanize template functions to make it more human friendly.", call),
details: "[Click here](https://prometheus.io/docs/prometheus/latest/configuration/template_reference/) for a full list of all available template functions.",
severity: Information,
Expand Down
1 change: 0 additions & 1 deletion internal/checks/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ type RuleChecker interface {
}

type exprProblem struct {
expr string
text string
details string
severity Severity
Expand Down
4 changes: 0 additions & 4 deletions internal/checks/promql_aggregation.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,12 @@ func (c AggregationCheck) checkNode(node *parser.PromQLNode) (problems []exprPro
if n.Without {
if found && c.keep {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("`%s` label is required and should be preserved when aggregating `%s` rules, remove %s from `without()`.", c.label, c.nameRegex.anchored, c.label),
})
}

if !found && !c.keep {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("`%s` label should be removed when aggregating `%s` rules, use `without(%s, ...)`.", c.label, c.nameRegex.anchored, c.label),
})
}
Expand All @@ -151,14 +149,12 @@ func (c AggregationCheck) checkNode(node *parser.PromQLNode) (problems []exprPro
} else {
if found && !c.keep {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("`%s` label should be removed when aggregating `%s` rules, remove %s from `by()`.", c.label, c.nameRegex.anchored, c.label),
})
}

if !found && c.keep {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("`%s` label is required and should be preserved when aggregating `%s` rules, use `by(%s, ...)`.", c.label, c.nameRegex.anchored, c.label),
})
}
Expand Down
48 changes: 43 additions & 5 deletions internal/checks/promql_fragile.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package checks

import (
"context"
"fmt"
"slices"

promParser "github.com/prometheus/prometheus/promql/parser"

Expand All @@ -12,6 +14,11 @@ import (

const (
FragileCheckName = "promql/fragile"

FragileCheckSamplingDetails = `Alerts are identied by labels, two alerts with identical sets of labels are identical.
If two alerts have the same name but the rest of labels isn't 100% identical then they are two different alerts.
If the same alert query returns results that over time have different labels on them then previous alert instances will resolve and new alerts will be fired.
This can happen when using one of the aggregation operation like topk or bottomk as they can return a different time series each time they are evaluated.`
)

func NewFragileCheck() FragileCheck {
Expand Down Expand Up @@ -46,18 +53,32 @@ func (c FragileCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rul
return nil
}

for _, problem := range c.checkNode(expr.Query) {
for _, problem := range c.checkAggregation(expr.Query) {
problems = append(problems, Problem{
Lines: expr.Value.Lines,
Reporter: c.Reporter(),
Text: problem.text,
Severity: Warning,
Details: problem.details,
Severity: problem.severity,
})
}

if rule.AlertingRule != nil {
for _, problem := range c.checkSampling(expr.Query) {
problems = append(problems, Problem{
Lines: expr.Value.Lines,
Reporter: c.Reporter(),
Text: problem.text,
Details: problem.details,
Severity: problem.severity,
})
}
}

return problems
}

func (c FragileCheck) checkNode(node *parser.PromQLNode) (problems []exprProblem) {
func (c FragileCheck) checkAggregation(node *parser.PromQLNode) (problems []exprProblem) {
if n := utils.HasOuterBinaryExpr(node); n != nil && n.Op != promParser.LOR && n.Op != promParser.LUNLESS {
if n.VectorMatching != nil && n.VectorMatching.On {
goto NEXT
Expand Down Expand Up @@ -89,7 +110,6 @@ func (c FragileCheck) checkNode(node *parser.PromQLNode) (problems []exprProblem
}
if len(series) >= 2 {
p := exprProblem{
expr: node.Expr.String(),
text: "Aggregation using `without()` can be fragile when used inside binary expression because both sides must have identical sets of labels to produce any results, adding or removing labels to metrics used here can easily break the query, consider aggregating using `by()` to ensure consistent labels.",
severity: Warning,
}
Expand All @@ -100,8 +120,26 @@ func (c FragileCheck) checkNode(node *parser.PromQLNode) (problems []exprProblem

NEXT:
for _, child := range node.Children {
problems = append(problems, c.checkNode(child)...)
problems = append(problems, c.checkAggregation(child)...)
}

return problems
}

func (c FragileCheck) checkSampling(node *parser.PromQLNode) (problems []exprProblem) {
src := utils.LabelsSource(node)
if src.Type != utils.AggregateSource {
return nil
}
if src.EmptyLabels {
return nil
}
if !slices.Contains([]string{"topk", "bottomk", "limit", "limit_ratio"}, src.Operation) {
return nil
}
return append(problems, exprProblem{
text: fmt.Sprintf("Using `%s` to select time series might return different set of time series on every query, which would cause flapping alerts.", src.Operation),
details: FragileCheckSamplingDetails,
severity: Warning,
})
}
42 changes: 37 additions & 5 deletions internal/checks/promql_fragile_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package checks_test

import (
"fmt"
"testing"

"github.com/cloudflare/pint/internal/checks"
Expand All @@ -12,6 +13,10 @@ func newFragileCheck(_ *promapi.FailoverGroup) checks.RuleChecker {
return checks.NewFragileCheck()
}

func fragileSampleFunc(s string) string {
return fmt.Sprintf("Using `%s` to select time series might return different set of time series on every query, which would cause flapping alerts.", s)
}

func TestFragileCheck(t *testing.T) {
text := "Aggregation using `without()` can be fragile when used inside binary expression because both sides must have identical sets of labels to produce any results, adding or removing labels to metrics used here can easily break the query, consider aggregating using `by()` to ensure consistent labels."

Expand Down Expand Up @@ -63,7 +68,7 @@ func TestFragileCheck(t *testing.T) {
First: 2,
Last: 2,
},
Reporter: "promql/fragile",
Reporter: checks.FragileCheckName,
Text: text,
Severity: checks.Warning,
},
Expand All @@ -82,7 +87,7 @@ func TestFragileCheck(t *testing.T) {
First: 2,
Last: 2,
},
Reporter: "promql/fragile",
Reporter: checks.FragileCheckName,
Text: text,
Severity: checks.Warning,
},
Expand All @@ -101,7 +106,7 @@ func TestFragileCheck(t *testing.T) {
First: 2,
Last: 2,
},
Reporter: "promql/fragile",
Reporter: checks.FragileCheckName,
Text: text,
Severity: checks.Warning,
},
Expand All @@ -120,7 +125,7 @@ func TestFragileCheck(t *testing.T) {
First: 2,
Last: 2,
},
Reporter: "promql/fragile",
Reporter: checks.FragileCheckName,
Text: text,
Severity: checks.Warning,
},
Expand All @@ -139,7 +144,7 @@ func TestFragileCheck(t *testing.T) {
First: 2,
Last: 2,
},
Reporter: "promql/fragile",
Reporter: checks.FragileCheckName,
Text: text,
Severity: checks.Warning,
},
Expand Down Expand Up @@ -216,6 +221,33 @@ func TestFragileCheck(t *testing.T) {
prometheus: noProm,
problems: noProblems,
},
{
description: "warns about topk() as source of series",
content: "- alert: foo\n expr: topk(10, foo)\n",
checker: newFragileCheck,
prometheus: noProm,
problems: func(_ string) []checks.Problem {
return []checks.Problem{
{
Lines: parser.LineRange{
First: 2,
Last: 2,
},
Reporter: checks.FragileCheckName,
Text: fragileSampleFunc("topk"),
Details: checks.FragileCheckSamplingDetails,
Severity: checks.Warning,
},
}
},
},
{
description: "ignores aggregated topk()",
content: "- alert: foo\n expr: min(topk(10, foo)) > 5000\n",
checker: newFragileCheck,
prometheus: noProm,
problems: noProblems,
},
}

runTests(t, testCases)
Expand Down
1 change: 0 additions & 1 deletion internal/checks/promql_range_query.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ func (c RangeQueryCheck) checkNode(ctx context.Context, node *parser.PromQLNode,
if n, ok := node.Expr.(*promParser.MatrixSelector); ok {
if n.Range > retention {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("`%s` selector is trying to query Prometheus for %s worth of metrics, but %s",
node.Expr, model.Duration(n.Range), reason),
severity: Warning,
Expand Down
5 changes: 0 additions & 5 deletions internal/checks/promql_rate.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
}
if m.Range < cfg.Config.Global.ScrapeInterval*time.Duration(c.minIntervals) {
p := exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("Duration for `%s()` must be at least %d x scrape_interval, %s is using `%s` scrape_interval.",
n.Func.Name, c.minIntervals, promText(c.prom.Name(), cfg.URI), output.HumanizeDuration(cfg.Config.Global.ScrapeInterval)),
details: RateCheckDetails,
Expand All @@ -121,7 +120,6 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
expr: s.Name,
text: text,
severity: severity,
})
Expand All @@ -130,7 +128,6 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
for _, m := range metadata.Metadata {
if m.Type != v1.MetricTypeCounter && m.Type != v1.MetricTypeUnknown {
problems = append(problems, exprProblem{
expr: s.Name,
text: fmt.Sprintf("`%s()` should only be used with counters but `%s` is a %s according to metrics metadata from %s.",
n.Func.Name, s.Name, m.Type, promText(c.prom.Name(), metadata.URI)),
details: RateCheckDetails,
Expand All @@ -153,7 +150,6 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
expr: sv.Name,
text: text,
severity: severity,
})
Expand All @@ -162,7 +158,6 @@ func (c RateCheck) checkNode(ctx context.Context, node *parser.PromQLNode, entri
for _, m := range metadata.Metadata {
if m.Type == v1.MetricTypeCounter {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("`rate(sum(counter))` chain detected, `%s` is called here on results of `%s`, calling `rate()` on `sum()` results will return bogus results, always `sum(rate(counter))`, never `rate(sum(counter))`.",
node.Expr, sm),
severity: Bug,
Expand Down
9 changes: 0 additions & 9 deletions internal/checks/promql_vector_matching.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: text,
severity: severity,
})
Expand Down Expand Up @@ -125,7 +124,6 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
for k, lv := range lhsMatchers {
if rv, ok := rhsMatchers[k]; ok && rv != lv {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("The left hand side uses `{%s=%q}` while the right hand side uses `{%s=%q}`, this will never match.", k, lv, k, rv),
details: VectorMatchingCheckDetails,
severity: Bug,
Expand All @@ -139,7 +137,6 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: text,
severity: severity,
})
Expand All @@ -153,7 +150,6 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
if err != nil {
text, severity := textAndSeverityFromError(err, c.Reporter(), c.prom.Name(), Bug)
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: text,
severity: severity,
})
Expand All @@ -167,7 +163,6 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
for _, name := range n.VectorMatching.MatchingLabels {
if !leftLabels.hasName(name) && rightLabels.hasName(name) {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf(
"Using `on(%s)` won't produce any results on %s because results from the left hand side of the query don't have this label: `%s`.",
name, promText(c.prom.Name(), qr.URI), node.Expr.(*promParser.BinaryExpr).LHS),
Expand All @@ -177,7 +172,6 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
}
if leftLabels.hasName(name) && !rightLabels.hasName(name) {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("Using `on(%s)` won't produce any results on %s because results from the right hand side of the query don't have this label: `%s`.",
name, promText(c.prom.Name(), qr.URI), node.Expr.(*promParser.BinaryExpr).RHS),
details: VectorMatchingCheckDetails,
Expand All @@ -186,7 +180,6 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
}
if !leftLabels.hasName(name) && !rightLabels.hasName(name) {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("Using `on(%s)` won't produce any results on %s because results from both sides of the query don't have this label: `%s`.",
name, promText(c.prom.Name(), qr.URI), node.Expr),
details: VectorMatchingCheckDetails,
Expand All @@ -198,15 +191,13 @@ func (c VectorMatchingCheck) checkNode(ctx context.Context, node *parser.PromQLN
l, r := leftLabels.getFirstNonOverlap(rightLabels)
if len(n.VectorMatching.MatchingLabels) == 0 {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("This query will never return anything on %s because results from the right and the left hand side have different labels: `%s` != `%s`. Failing query: `%s`.",
promText(c.prom.Name(), qr.URI), l, r, node.Expr),
details: VectorMatchingCheckDetails,
severity: Bug,
})
} else {
problems = append(problems, exprProblem{
expr: node.Expr.String(),
text: fmt.Sprintf("Using `ignoring(%s)` won't produce any results on %s because results from both sides of the query have different labels: `%s` != `%s`. Failing query: `%s`.",
strings.Join(n.VectorMatching.MatchingLabels, ","), promText(c.prom.Name(), qr.URI), l, r, node.Expr),
details: VectorMatchingCheckDetails,
Expand Down
Loading

0 comments on commit e45f978

Please sign in to comment.