From b6e96bf525144ec3cbec5b1bf8eb94505eb9d243 Mon Sep 17 00:00:00 2001 From: Rogger Vasquez Date: Mon, 22 Aug 2022 10:36:31 -0500 Subject: [PATCH] rpk: grafana-generate - support public metrics rpk generate grafana-dashboard has a summary section that uses some metrics that don't exist in the new /public_metrics endpoint, so build a dedicated summary section for that endpoint. --- src/go/rpk/pkg/cli/cmd/generate/graf/graph.go | 1 + src/go/rpk/pkg/cli/cmd/generate/grafana.go | 256 +++++++++++++++--- .../rpk/pkg/cli/cmd/generate/grafana_test.go | 2 +- 3 files changed, 227 insertions(+), 32 deletions(-) diff --git a/src/go/rpk/pkg/cli/cmd/generate/graf/graph.go b/src/go/rpk/pkg/cli/cmd/generate/graf/graph.go index b9a229e11be6e..88118cf3e4431 100644 --- a/src/go/rpk/pkg/cli/cmd/generate/graf/graph.go +++ b/src/go/rpk/pkg/cli/cmd/generate/graf/graph.go @@ -26,6 +26,7 @@ type GraphPanel struct { Tooltip Tooltip `json:"tooltip"` AliasColors AliasColors `json:"aliasColors"` SteppedLine bool `json:"steppedLine"` + Interval string `json:"interval,omitempty"` } func (*GraphPanel) Type() string { diff --git a/src/go/rpk/pkg/cli/cmd/generate/grafana.go b/src/go/rpk/pkg/cli/cmd/generate/grafana.go index ab23a03171682..30c16ac3ef597 100644 --- a/src/go/rpk/pkg/cli/cmd/generate/grafana.go +++ b/src/go/rpk/pkg/cli/cmd/generate/grafana.go @@ -107,7 +107,8 @@ func executeGrafanaDashboard(metricsEndpoint string) error { if err != nil { return err } - dashboard := buildGrafanaDashboard(metricFamilies) + isPublicMetrics := strings.Contains(metricsEndpoint, "public_metrics") + dashboard := buildGrafanaDashboard(metricFamilies, isPublicMetrics) jsonSpec, err := json.MarshalIndent(dashboard, "", " ") if err != nil { return err @@ -124,13 +125,19 @@ func executeGrafanaDashboard(metricsEndpoint string) error { func buildGrafanaDashboard( metricFamilies map[string]*dto.MetricFamily, + isPublicMetrics bool, ) graf.Dashboard { intervals := []string{"5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"} timeOptions := []string{"5m", "15m", 
"1h", "6h", "12h", "24h", "2d", "7d", "30d"} - summaryPanels := buildSummary(metricFamilies) + var summaryPanels []graf.Panel + if isPublicMetrics { + summaryPanels = buildPublicMetricsSummary(metricFamilies) + } else { + summaryPanels = buildSummary(metricFamilies) + } lastY := summaryPanels[len(summaryPanels)-1].GetGridPos().Y + panelHeight rowSet := newRowSet() - rowSet.processRows(metricFamilies) + rowSet.processRows(metricFamilies, isPublicMetrics) rowSet.addCachePerformancePanels(metricFamilies) rows := rowSet.finalize(lastY) return graf.Dashboard{ @@ -173,7 +180,7 @@ func (rowSet *RowSet) finalize(fromY int) []graf.Panel { return rows } -func (rowSet *RowSet) processRows(metricFamilies map[string]*dto.MetricFamily) { +func (rowSet *RowSet) processRows(metricFamilies map[string]*dto.MetricFamily, isPublicMetrics bool) { names := []string{} for k := range metricFamilies { names = append(names, k) @@ -182,12 +189,14 @@ func (rowSet *RowSet) processRows(metricFamilies map[string]*dto.MetricFamily) { for _, name := range names { family := metricFamilies[name] var panel graf.Panel - if family.GetType() == dto.MetricType_COUNTER { - panel = newCounterPanel(family) + // hack around redpanda_storage_* metrics: these should be gauge + // panels but the metric type comes as COUNTER + if family.GetType() == dto.MetricType_COUNTER && !strings.Contains(name, "redpanda_storage") { + panel = newCounterPanel(family, isPublicMetrics) } else if subtype(family) == "histogram" { - panel = newPercentilePanel(family, 0.95) + panel = newPercentilePanel(family, 0.95, isPublicMetrics) } else { - panel = newGaugePanel(family) + panel = newGaugePanel(family, isPublicMetrics) } if panel == nil { @@ -286,6 +295,8 @@ func buildTemplating() graf.Templating { } } +// buildSummary builds the Summary section of the Redpanda generated grafana +// dashboard that uses the /metrics endpoint. 
func buildSummary(metricFamilies map[string]*dto.MetricFamily) []graf.Panel { maxWidth := 24 singleStatW := 2 @@ -334,7 +345,7 @@ func buildSummary(metricFamilies map[string]*dto.MetricFamily) []graf.Panel { if kafkaExists { width := (maxWidth - (singleStatW * 2)) / percentilesNo for i, p := range percentiles { - panel := newPercentilePanel(kafkaFamily, p) + panel := newPercentilePanel(kafkaFamily, p, false) panel.GridPos = graf.GridPos{ H: panelHeight, W: width, @@ -355,7 +366,7 @@ func buildSummary(metricFamilies map[string]*dto.MetricFamily) []graf.Panel { y += rpcLatencyTitle.GridPos.H panels = append(panels, rpcLatencyTitle) for i, p := range percentiles { - panel := newPercentilePanel(rpcFamily, p) + panel := newPercentilePanel(rpcFamily, p, false) panel.GridPos = graf.GridPos{ H: panelHeight, W: width, @@ -380,7 +391,7 @@ func buildSummary(metricFamilies map[string]*dto.MetricFamily) []graf.Panel { readBytesFamily, readBytesExist := metricFamilies["vectorized_storage_log_read_bytes"] writtenBytesFamily, writtenBytesExist := metricFamilies["vectorized_storage_log_written_bytes"] if readBytesExist && writtenBytesExist { - readPanel := newCounterPanel(readBytesFamily) + readPanel := newCounterPanel(readBytesFamily, false) readPanel.GridPos = graf.GridPos{ H: panelHeight, W: width, @@ -389,7 +400,7 @@ func buildSummary(metricFamilies map[string]*dto.MetricFamily) []graf.Panel { } panels = append(panels, readPanel) - writtenPanel := newCounterPanel(writtenBytesFamily) + writtenPanel := newCounterPanel(writtenBytesFamily, false) writtenPanel.GridPos = graf.GridPos{ H: panelHeight, W: width, @@ -402,6 +413,186 @@ func buildSummary(metricFamilies map[string]*dto.MetricFamily) []graf.Panel { return panels } +// buildPublicMetricsSummary builds the Summary section of the Redpanda generated +// grafana dashboard that uses the /public_metrics endpoint. 
+func buildPublicMetricsSummary(metricFamilies map[string]*dto.MetricFamily) []graf.Panel { + maxWidth := 24 + singleStatW := 2 + percentiles := []float32{0.95, 0.99} + percentilesNo := len(percentiles) + panels := []graf.Panel{} + y := 0 + + summaryText := htmlHeader("Redpanda Summary") + summaryTitle := graf.NewTextPanel(summaryText, "html") + summaryTitle.GridPos = graf.GridPos{H: 2, W: maxWidth, X: 0, Y: y} + summaryTitle.Transparent = true + panels = append(panels, summaryTitle) + y += summaryTitle.GridPos.H + + // Nodes Up Panel + nodesUp := graf.NewSingleStatPanel("Nodes Up") + nodesUp.Datasource = datasource + nodesUp.GridPos = graf.GridPos{H: 6, W: singleStatW, X: 0, Y: y} + nodesUp.Targets = []graf.Target{{ + Expr: `redpanda_cluster_brokers`, + Step: 40, + IntervalFactor: 1, + LegendFormat: "Nodes Up", + Instant: true, + }} + nodesUp.Transparent = true + panels = append(panels, nodesUp) + y += nodesUp.GridPos.H + + // Partitions Panel + partitionCount := graf.NewSingleStatPanel("Partitions") + partitionCount.Datasource = datasource + partitionCount.GridPos = graf.GridPos{ + H: 6, + W: singleStatW, + X: 0, + Y: nodesUp.GridPos.H, + } + partitionCount.Targets = []graf.Target{{ + Expr: "redpanda_cluster_partitions", + LegendFormat: "Partition count", + Instant: true, + }} + partitionCount.Transparent = true + panels = append(panels, partitionCount) + y += partitionCount.GridPos.H + + // Latency of Kafka consume/produce requests (p95 - p99) + _, kafkaExists := metricFamilies[`redpanda_kafka_request_latency_seconds`] + if kafkaExists { + width := (maxWidth - singleStatW) / percentilesNo + for i, p := range percentiles { + pTarget := graf.Target{ + Expr: fmt.Sprintf(`histogram_quantile(%.2f, sum(rate(redpanda_kafka_request_latency_seconds_bucket{instance=~"$node", redpanda_request="produce"}[$__rate_interval])) by (le, provider, region, instance, namespace, pod))`, p), + LegendFormat: "node: {{instance}}", + Format: "time_series", + Step: 10, + IntervalFactor: 
2, + RefID: "A", + } + pTitle := fmt.Sprintf("Latency of Kafka produce requests (p%.0f) per broker", p*100) + producePanel := newGraphPanel(pTitle, pTarget, "s") + producePanel.Interval = "1m" + producePanel.Lines = true + producePanel.SteppedLine = true + producePanel.NullPointMode = "null as zero" + producePanel.Tooltip.ValueType = "individual" + producePanel.Tooltip.Sort = 0 + producePanel.GridPos = graf.GridPos{ + H: panelHeight, + W: width, + X: i*width + singleStatW, + Y: y, + } + cTarget := graf.Target{ + Expr: fmt.Sprintf(`histogram_quantile(%.2f, sum(rate(redpanda_kafka_request_latency_seconds_bucket{instance=~"$node", redpanda_request="consume"}[$__rate_interval])) by (le, provider, region, instance, namespace, pod))`, p), + LegendFormat: "node: {{instance}}", + Format: "time_series", + Step: 10, + IntervalFactor: 2, + RefID: "A", + } + cTitle := fmt.Sprintf("Latency of Kafka consume requests (p%.0f) per broker", p*100) + consumePanel := newGraphPanel(cTitle, cTarget, "s") + consumePanel.Interval = "1m" + consumePanel.Lines = true + consumePanel.SteppedLine = true + consumePanel.NullPointMode = "null as zero" + consumePanel.Tooltip.ValueType = "individual" + consumePanel.Tooltip.Sort = 0 + consumePanel.GridPos = graf.GridPos{ + H: panelHeight, + W: width, + X: i*width + singleStatW, + Y: producePanel.GridPos.H, + } + panels = append(panels, consumePanel, producePanel) + } + y += panelHeight + } + width := maxWidth / 4 + + // Internal RPC Latency Section + rpcLatencyText := htmlHeader("Internal RPC Latency") + rpcLatencyTitle := graf.NewTextPanel(rpcLatencyText, "html") + rpcLatencyTitle.GridPos = graf.GridPos{H: 2, W: maxWidth / 2, X: 0, Y: y} + rpcLatencyTitle.Transparent = true + rpcFamily, rpcExists := metricFamilies[`redpanda_rpc_request_latency_seconds`] + if rpcExists { + y += rpcLatencyTitle.GridPos.H + panels = append(panels, rpcLatencyTitle) + for i, p := range percentiles { + template := `histogram_quantile(%.2f, 
sum(rate(%s_bucket{instance=~"$node",redpanda_server="internal"}[$__rate_interval])) by (le, $aggr_criteria))` + expr := fmt.Sprintf(template, p, rpcFamily.GetName()) + target := graf.Target{ + Expr: expr, + LegendFormat: "node: {{instance}}", + Format: "time_series", + Step: 10, + IntervalFactor: 2, + RefID: "A", + } + title := fmt.Sprintf("%s (p%.0f)", rpcFamily.GetHelp(), p*100) + panel := newGraphPanel(title, target, "s") + panel.Interval = "1m" + panel.Lines = true + panel.SteppedLine = true + panel.NullPointMode = "null as zero" + panel.Tooltip.ValueType = "individual" + panel.Tooltip.Sort = 0 + panel.GridPos = graf.GridPos{ + H: panelHeight, + W: width, + X: i * width, + Y: y, + } + panels = append(panels, panel) + } + } + + // Throughput section + throughputText := htmlHeader("Throughput") + throughputTitle := graf.NewTextPanel(throughputText, "html") + throughputTitle.GridPos = graf.GridPos{ + H: 2, + W: maxWidth / 2, + X: rpcLatencyTitle.GridPos.W, + Y: rpcLatencyTitle.GridPos.Y, + } + throughputTitle.Transparent = true + panels = append(panels, throughputTitle) + + reqBytesFamily, reqBytesExists := metricFamilies["redpanda_kafka_request_bytes_total"] + if reqBytesExists { + target := graf.Target{ + Expr: `sum(rate(redpanda_kafka_request_bytes_total[$__rate_interval])) by (redpanda_request)`, + LegendFormat: "redpanda_request: {{redpanda_request}}", + Format: "time_series", + Step: 10, + IntervalFactor: 2, + } + panel := newGraphPanel("Rate - "+reqBytesFamily.GetHelp(), target, "Bps") + panel.Interval = "1m" + panel.Lines = true + panel.GridPos = graf.GridPos{ + H: panelHeight, + W: width * 2, + X: maxWidth / 2, + Y: y, + } + panel.Title = "Throughput of Kafka produce/consume requests for the cluster" + panels = append(panels, panel) + } + + return panels +} + func metricGroup(metric string) string { for _, group := range metricGroups { if strings.Contains(metric, group) { @@ -445,15 +636,14 @@ func fetchMetrics( } func newPercentilePanel( - m 
*dto.MetricFamily, percentile float32, + m *dto.MetricFamily, percentile float32, isPublicMetrics bool, ) *graf.GraphPanel { - expr := fmt.Sprintf( - `histogram_quantile(%.2f, sum(rate(%s_bucket{instance=~"$node",shard=~"$node_shard"}[2m])) by (le, $aggr_criteria))`, - percentile, - m.GetName(), - ) + template := `histogram_quantile(%.2f, sum(rate(%s_bucket{instance=~"$node",shard=~"$node_shard"}[2m])) by (le, $aggr_criteria))` + if isPublicMetrics { + template = `histogram_quantile(%.2f, sum(rate(%s_bucket{instance=~"$node"}[$__rate_interval])) by (le, $aggr_criteria))` + } target := graf.Target{ - Expr: expr, + Expr: fmt.Sprintf(template, percentile, m.GetName()), LegendFormat: legendFormat(m), Format: "time_series", Step: 10, @@ -467,16 +657,17 @@ func newPercentilePanel( panel.NullPointMode = "null as zero" panel.Tooltip.ValueType = "individual" panel.Tooltip.Sort = 0 + panel.Interval = "1m" return panel } -func newCounterPanel(m *dto.MetricFamily) *graf.GraphPanel { - expr := fmt.Sprintf( - `sum(irate(%s{instance=~"$node",shard=~"$node_shard"}[2m])) by ($aggr_criteria)`, - m.GetName(), - ) +func newCounterPanel(m *dto.MetricFamily, isPublicMetrics bool) *graf.GraphPanel { + template := `sum(irate(%s{instance=~"$node",shard=~"$node_shard"}[2m])) by ($aggr_criteria)` + if isPublicMetrics { + template = `sum(rate(%s{instance=~"$node"}[$__rate_interval])) by ($aggr_criteria)` + } target := graf.Target{ - Expr: expr, + Expr: fmt.Sprintf(template, m.GetName()), LegendFormat: legendFormat(m), Format: "time_series", Step: 10, @@ -485,19 +676,22 @@ func newCounterPanel(m *dto.MetricFamily) *graf.GraphPanel { format := "ops" if strings.Contains(m.GetName(), "bytes") { format = "Bps" + } else if strings.Contains(m.GetName(), "redpanda_scheduler") { + format = "percentunit" } panel := newGraphPanel("Rate - "+m.GetHelp(), target, format) panel.Lines = true + panel.Interval = "1m" return panel } -func newGaugePanel(m *dto.MetricFamily) *graf.GraphPanel { - expr := 
fmt.Sprintf( - `sum(%s{instance=~"$node",shard=~"$node_shard"}) by ($aggr_criteria)`, - m.GetName(), - ) +func newGaugePanel(m *dto.MetricFamily, isPublicMetrics bool) *graf.GraphPanel { + template := `sum(%s{instance=~"$node",shard=~"$node_shard"}) by ($aggr_criteria)` + if isPublicMetrics { + template = `sum(%s{instance=~"$node"}) by ($aggr_criteria)` + } target := graf.Target{ - Expr: expr, + Expr: fmt.Sprintf(template, m.GetName()), LegendFormat: legendFormat(m), Format: "time_series", Step: 10, diff --git a/src/go/rpk/pkg/cli/cmd/generate/grafana_test.go b/src/go/rpk/pkg/cli/cmd/generate/grafana_test.go index d8e4fe07dd9c9..d96701a35d05a 100644 --- a/src/go/rpk/pkg/cli/cmd/generate/grafana_test.go +++ b/src/go/rpk/pkg/cli/cmd/generate/grafana_test.go @@ -65,7 +65,7 @@ vectorized_vectorized_internal_rpc_dispatch_handler_latency_bucket{le="20.000000 vectorized_memory_allocated_memory_bytes{shard="0",type="bytes"} 40837120 vectorized_memory_allocated_memory_bytes{shard="1",type="bytes"} 36986880 ` - expected := `{"title":"Redpanda","templating":{"list":[{"name":"node","datasource":"prometheus","label":"Node","type":"query","refresh":1,"options":[],"includeAll":true,"allFormat":"","allValue":".*","multi":true,"multiFormat":"","query":"label_values(instance)","current":{"text":"","value":null},"hide":0,"sort":1},{"name":"node_shard","datasource":"prometheus","label":"Shard","type":"query","refresh":1,"options":[],"includeAll":true,"allFormat":"","allValue":".*","multi":true,"multiFormat":"","query":"label_values(shard)","current":{"text":"","value":null},"hide":0,"sort":1},{"name":"aggr_criteria","datasource":"prometheus","label":"Aggregate by","type":"custom","refresh":1,"options":[{"text":"Cluster","value":"","selected":false},{"text":"Instance","value":"instance,","selected":false},{"text":"Instance, Shard","value":"instance,shard,","selected":false}],"includeAll":false,"allFormat":"","allValue":"","multi":false,"multiFormat":"","query":"Cluster : 
cluster,Instance : instance,Instance\\,Shard : instance\\,shard","current":{"text":"Cluster","value":""},"hide":0,"sort":1}]},"panels":[{"type":"text","id":1,"title":"","editable":true,"gridPos":{"h":2,"w":24,"x":0,"y":0},"transparent":true,"links":null,"span":1,"error":false,"content":"

Redpanda Summary

","mode":"html"},{"type":"singlestat","id":2,"title":"Nodes Up","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":2,"x":0,"y":2},"transparent":true,"span":1,"error":false,"targets":[{"refId":"","expr":"count by (app) (vectorized_application_uptime)","intervalFactor":1,"step":40,"legendFormat":"Nodes Up"}],"format":"none","prefix":"","postfix":"","maxDataPoints":100,"valueMaps":[{"value":"null","op":"=","text":"N/A"}],"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"nullPointMode":"connected","valueName":"current","valueFontSize":"200%","prefixFontSize":"50%","postfixFontSize":"50%","colorBackground":false,"colorValue":true,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"thresholds":"","sparkline":{"show":false,"full":false,"ymin":null,"ymax":null,"lineColor":"rgb(31, 120, 193)","fillColor":"rgba(31, 118, 189, 0.18)"},"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false},"links":[],"interval":null,"timeFrom":null,"timeShift":null,"nullText":null,"cacheTimeout":null,"tableColumn":""},{"type":"singlestat","id":3,"title":"Partitions","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":2,"x":2,"y":8},"transparent":true,"span":1,"error":false,"targets":[{"refId":"","expr":"count(count by (topic,partition) (vectorized_storage_log_partition_size{namespace=\"kafka\"}))","legendFormat":"Partition count"}],"format":"none","prefix":"","postfix":"","maxDataPoints":100,"valueMaps":[{"value":"null","op":"=","text":"N/A"}],"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"nullPointMode":"connected","valueName":"current","valueFontSize":"200%","prefixFontSize":"50%","postfixFontSize":"50%","colorBackground":false,"colorValue":true,"colors":["#299c46","rgba(237, 129, 40, 
0.89)","#d44a3a"],"thresholds":"","sparkline":{"show":false,"full":false,"ymin":null,"ymax":null,"lineColor":"rgb(31, 120, 193)","fillColor":"rgba(31, 118, 189, 0.18)"},"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false},"links":[],"interval":null,"timeFrom":null,"timeShift":null,"nullText":null,"cacheTimeout":null,"tableColumn":""},{"type":"text","id":5,"title":"","editable":true,"gridPos":{"h":2,"w":12,"x":12,"y":14},"transparent":true,"links":null,"span":1,"error":false,"content":"

Throughput

","mode":"html"},{"type":"row","collapsed":true,"id":7,"title":"memory","editable":true,"gridPos":{"h":6,"w":24,"x":0,"y":20},"transparent":false,"links":null,"span":0,"error":false,"panels":[{"type":"graph","id":6,"title":"Rate - Allocated memory size in bytes","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":0,"y":20},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"","expr":"sum(irate(vectorized_memory_allocated_memory_bytes{instance=~\"$node\",shard=~\"$node_shard\"}[2m])) by ($aggr_criteria)","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: {{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"Bps"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"cumulative","msResolution":true},"aliasColors":{},"steppedLine":false}]},{"type":"row","collapsed":true,"id":9,"title":"vectorized_internal_rpc","editable":true,"gridPos":{"h":6,"w":24,"x":0,"y":21},"transparent":false,"links":null,"span":0,"error":false,"panels":[{"type":"graph","id":8,"title":"Amount of memory consumed for requests processing","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":0,"y":21},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"","expr":"sum(vectorized_vectorized_internal_rpc_consumed_mem{instance=~\"$node\",shard=~\"$node_shard\"}) by ($aggr_criteria)","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: 
{{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"short"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"cumulative","msResolution":true},"aliasColors":{},"steppedLine":true},{"type":"graph","id":10,"title":"Rate - Number of requests with corrupted headers","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":8,"y":21},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"","expr":"sum(irate(vectorized_vectorized_internal_rpc_corrupted_headers{instance=~\"$node\",shard=~\"$node_shard\"}[2m])) by ($aggr_criteria)","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: {{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"ops"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"cumulative","msResolution":true},"aliasColors":{},"steppedLine":false},{"type":"graph","id":11,"title":"Latency of service handler dispatch (p95)","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":16,"y":21},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"A","expr":"histogram_quantile(0.95, sum(rate(vectorized_vectorized_internal_rpc_dispatch_handler_latency_bucket{instance=~\"$node\",shard=~\"$node_shard\"}[2m])) by (le, 
$aggr_criteria))","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: {{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"µs"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null as zero","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"individual","msResolution":true},"aliasColors":{},"steppedLine":true}]}],"editable":true,"timezone":"utc","refresh":"10s","time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"annotations":{"list":null},"links":null,"schemaVersion":12}` + expected := `{"title":"Redpanda","templating":{"list":[{"name":"node","datasource":"prometheus","label":"Node","type":"query","refresh":1,"options":[],"includeAll":true,"allFormat":"","allValue":".*","multi":true,"multiFormat":"","query":"label_values(instance)","current":{"text":"","value":null},"hide":0,"sort":1},{"name":"node_shard","datasource":"prometheus","label":"Shard","type":"query","refresh":1,"options":[],"includeAll":true,"allFormat":"","allValue":".*","multi":true,"multiFormat":"","query":"label_values(shard)","current":{"text":"","value":null},"hide":0,"sort":1},{"name":"aggr_criteria","datasource":"prometheus","label":"Aggregate by","type":"custom","refresh":1,"options":[{"text":"Cluster","value":"","selected":false},{"text":"Instance","value":"instance,","selected":false},{"text":"Instance, Shard","value":"instance,shard,","selected":false}],"includeAll":false,"allFormat":"","allValue":"","multi":false,"multiFormat":"","query":"Cluster : cluster,Instance : instance,Instance\\,Shard : 
instance\\,shard","current":{"text":"Cluster","value":""},"hide":0,"sort":1}]},"panels":[{"type":"text","id":1,"title":"","editable":true,"gridPos":{"h":2,"w":24,"x":0,"y":0},"transparent":true,"links":null,"span":1,"error":false,"content":"

Redpanda Summary

","mode":"html"},{"type":"singlestat","id":2,"title":"Nodes Up","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":2,"x":0,"y":2},"transparent":true,"span":1,"error":false,"targets":[{"refId":"","expr":"count by (app) (vectorized_application_uptime)","intervalFactor":1,"step":40,"legendFormat":"Nodes Up"}],"format":"none","prefix":"","postfix":"","maxDataPoints":100,"valueMaps":[{"value":"null","op":"=","text":"N/A"}],"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"nullPointMode":"connected","valueName":"current","valueFontSize":"200%","prefixFontSize":"50%","postfixFontSize":"50%","colorBackground":false,"colorValue":true,"colors":["#299c46","rgba(237, 129, 40, 0.89)","#d44a3a"],"thresholds":"","sparkline":{"show":false,"full":false,"ymin":null,"ymax":null,"lineColor":"rgb(31, 120, 193)","fillColor":"rgba(31, 118, 189, 0.18)"},"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false},"links":[],"interval":null,"timeFrom":null,"timeShift":null,"nullText":null,"cacheTimeout":null,"tableColumn":""},{"type":"singlestat","id":3,"title":"Partitions","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":2,"x":2,"y":8},"transparent":true,"span":1,"error":false,"targets":[{"refId":"","expr":"count(count by (topic,partition) (vectorized_storage_log_partition_size{namespace=\"kafka\"}))","legendFormat":"Partition count"}],"format":"none","prefix":"","postfix":"","maxDataPoints":100,"valueMaps":[{"value":"null","op":"=","text":"N/A"}],"mappingTypes":[{"name":"value to text","value":1},{"name":"range to text","value":2}],"rangeMaps":[{"from":"null","to":"null","text":"N/A"}],"mappingType":1,"nullPointMode":"connected","valueName":"current","valueFontSize":"200%","prefixFontSize":"50%","postfixFontSize":"50%","colorBackground":false,"colorValue":true,"colors":["#299c46","rgba(237, 129, 40, 
0.89)","#d44a3a"],"thresholds":"","sparkline":{"show":false,"full":false,"ymin":null,"ymax":null,"lineColor":"rgb(31, 120, 193)","fillColor":"rgba(31, 118, 189, 0.18)"},"gauge":{"show":false,"minValue":0,"maxValue":100,"thresholdMarkers":true,"thresholdLabels":false},"links":[],"interval":null,"timeFrom":null,"timeShift":null,"nullText":null,"cacheTimeout":null,"tableColumn":""},{"type":"text","id":5,"title":"","editable":true,"gridPos":{"h":2,"w":12,"x":12,"y":14},"transparent":true,"links":null,"span":1,"error":false,"content":"

Throughput

","mode":"html"},{"type":"row","collapsed":true,"id":7,"title":"memory","editable":true,"gridPos":{"h":6,"w":24,"x":0,"y":20},"transparent":false,"links":null,"span":0,"error":false,"panels":[{"type":"graph","id":6,"interval":"1m","title":"Rate - Allocated memory size in bytes","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":0,"y":20},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"","expr":"sum(irate(vectorized_memory_allocated_memory_bytes{instance=~\"$node\",shard=~\"$node_shard\"}[2m])) by ($aggr_criteria)","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: {{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"Bps"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"cumulative","msResolution":true},"aliasColors":{},"steppedLine":false}]},{"type":"row","collapsed":true,"id":9,"title":"vectorized_internal_rpc","editable":true,"gridPos":{"h":6,"w":24,"x":0,"y":21},"transparent":false,"links":null,"span":0,"error":false,"panels":[{"type":"graph","id":8,"title":"Amount of memory consumed for requests processing","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":0,"y":21},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"","expr":"sum(vectorized_vectorized_internal_rpc_consumed_mem{instance=~\"$node\",shard=~\"$node_shard\"}) by ($aggr_criteria)","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: 
{{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"short"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"cumulative","msResolution":true},"aliasColors":{},"steppedLine":true},{"type":"graph","id":10,"interval":"1m","title":"Rate - Number of requests with corrupted headers","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":8,"y":21},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"","expr":"sum(irate(vectorized_vectorized_internal_rpc_corrupted_headers{instance=~\"$node\",shard=~\"$node_shard\"}[2m])) by ($aggr_criteria)","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: {{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"ops"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"cumulative","msResolution":true},"aliasColors":{},"steppedLine":false},{"type":"graph","id":11,"interval":"1m","title":"Latency of service handler dispatch (p95)","datasource":"prometheus","editable":true,"gridPos":{"h":6,"w":8,"x":16,"y":21},"transparent":false,"links":null,"renderer":"flot","span":4,"error":false,"targets":[{"refId":"A","expr":"histogram_quantile(0.95, sum(rate(vectorized_vectorized_internal_rpc_dispatch_handler_latency_bucket{instance=~\"$node\",shard=~\"$node_shard\"}[2m])) by (le, 
$aggr_criteria))","intervalFactor":2,"step":10,"legendFormat":"node: {{instance}}, shard: {{shard}}","format":"time_series"}],"xaxis":{"format":"","logBase":0,"show":true,"mode":"time"},"yaxes":[{"label":null,"show":true,"logBase":1,"min":0,"format":"µs"},{"label":null,"show":true,"logBase":1,"min":0,"format":"short"}],"legend":{"show":true,"max":false,"min":false,"values":false,"avg":false,"current":false,"total":false},"fill":1,"linewidth":2,"nullPointMode":"null as zero","thresholds":null,"lines":true,"bars":false,"tooltip":{"shared":true,"value_type":"individual","msResolution":true},"aliasColors":{},"steppedLine":true}]}],"editable":true,"timezone":"utc","refresh":"10s","time":{"from":"now-1h","to":"now"},"timepicker":{"refresh_intervals":["5s","10s","30s","1m","5m","15m","30m","1h","2h","1d"],"time_options":["5m","15m","1h","6h","12h","24h","2d","7d","30d"]},"annotations":{"list":null},"links":null,"schemaVersion":12}` ts := httptest.NewServer( http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK)