Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mixin/alerts.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Entry point: evaluates to the `prometheusAlerts` field of the mixin
// assembled in mixin.libsonnet.
local mixin = import './mixin.libsonnet';

mixin.prometheusAlerts
25 changes: 25 additions & 0 deletions mixin/alerts/add-runbook-links.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// source: https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/lib/add-runbook-links.libsonnet
local utils = import '../lib/utils.libsonnet';


// lower returns `x` with the ASCII uppercase letters A-Z converted to
// lowercase; every other character is left unchanged.
// Delegates to the standard library instead of re-implementing the
// codepoint arithmetic by hand.
local lower(x) = std.asciiLower(x);

{
  _config+:: {
    // Format string for the generated `runbook_url` annotation. It is applied
    // with `%` to the lowercased alert name, so it should contain one `%s`.
    // Evaluation fails with this error unless a later mixin overrides it.
    runbookURLPattern: error 'must provide runbookURLPattern',
  },

  // Adds a `runbook_url` annotation to every alerting rule that does not
  // already define one. Rules without an `alert` field are returned unchanged.
  prometheusAlerts+::
    // True when the rule already carries its own runbook link. The
    // `annotations` field is checked first so that rules declared without
    // any annotations do not abort evaluation.
    local hasRunbookURL(rule) =
      'annotations' in rule && 'runbook_url' in rule.annotations;
    local addRunbookURL(rule) = rule {
      // The computed field name is null (field omitted) when nothing has to
      // be added; `+:` merges into existing annotations or creates them.
      [if 'alert' in rule && !hasRunbookURL(rule) then 'annotations']+: {
        runbook_url: $._config.runbookURLPattern % lower(rule.alert),
      },
    };
    utils.mapRuleGroups(addRunbookURL),
}
2 changes: 2 additions & 0 deletions mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Alert rules of the mixin: the Cassandra alert definitions combined with the
// runbook-link mixin.
local cassandraAlerts = import './cassandra.libsonnet';
local runbookLinks = import './add-runbook-links.libsonnet';

cassandraAlerts + runbookLinks
402 changes: 402 additions & 0 deletions mixin/alerts/cassandra.libsonnet

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
_config+:: {
// cassandraSelector is inserted as part of the label selector in
// PromQL queries to identify metrics collected from Cassandra
// servers.
// With the Kubernetes cass-operator the following selector can be used:
// cassandraSelector: 'cassandra_datastax_com_cluster!="", cassandra_datastax_com_datacenter!=""',
// TODO: use in all dashboard queries
cassandraSelector: 'cluster!="", dc!=""',

// dimensions helps mixin users add high-level target grouping to their alerts and dashboards.
// With the help of dimensions you can use a single observability stack to monitor several Cassandra clusters.
// Each label in the list is used in the alerts to define aggregations (in `by ()`) and in the dashboards to define variables.
// Inspired by: https://github.com/thanos-io/thanos/blob/v0.25.2/mixin/config.libsonnet
// TODO: use in dashboards variable templates
dimensions: ['cluster', 'dc'],

// runbookURLPattern is used to create a `runbook_url` annotation for each alert
// that does not already define one. The `%s` placeholder is replaced with the
// lowercased alert name.
runbookURLPattern: 'https://github.com/datastax/metric-collector-for-apache-cassandra/tree/master/mixin/runbook.md#alert-name-%s',
},
}
14 changes: 14 additions & 0 deletions mixin/lib/utils.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// source: https://github.com/kubernetes-monitoring/kubernetes-mixin/blob/master/lib/utils.libsonnet
{
  // mapRuleGroups(f) returns an object meant to be added on top of an object
  // that has a `groups` array. Every element of each group's `rules` array is
  // replaced by `f(rule)`; the other fields of each group are kept.
  mapRuleGroups(f): {
    groups: std.map(
      // Inside the group override, `super.rules` is the group's original list.
      function(ruleGroup) ruleGroup { rules: std.map(f, super.rules) },
      super.groups
    ),
  },
}
7 changes: 7 additions & 0 deletions mixin/make-alerts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Evaluates mixin/alerts.jsonnet by running `jsonnet` inside the
# datastax/grafonnet-lib container.
set -euo pipefail

# Physical (symlink-resolved) path of the directory one level above the
# directory that contains this script.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
repo_root="$(cd -P "${script_dir}/.." && pwd)"

# Mount the repository at the same path inside the container so that the
# absolute path to alerts.jsonnet is valid there as well.
docker run \
  --rm \
  --volume "${repo_root}:${repo_root}" \
  datastax/grafonnet-lib:v0.1.3 \
  jsonnet "${repo_root}/mixin/alerts.jsonnet"
2 changes: 2 additions & 0 deletions mixin/mixin.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// The complete mixin: the alert rules with the default configuration added
// on top.
local alerts = import './alerts/alerts.libsonnet';
local config = import './config.libsonnet';

alerts + config
158 changes: 158 additions & 0 deletions mixin/runbook.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Cassandra Alerts runbooks

## Group Name: "metric-collector-for-apache-cassandra"

### Alert Name: "CassandraReadLatencyHigh"

*Summary*: `Cassandra has high latency for read requests.`
*Severity*: warning

### Alert Name: "CassandraWriteLatencyHigh"

*Summary*: `Cassandra has high latency for write requests.`
*Severity*: warning

### Alert Name: "CassandraTableReadLatencyHigh"

*Summary*: `Cassandra table has high latency for read requests.`
*Severity*: warning

### Alert Name: "CassandraTableWriteLatencyHigh"

*Summary*: `Cassandra table has high latency for write requests.`
*Severity*: warning

### Alert Name: "CassandraCrossNodeLatencyHigh"

*Summary*: `Cassandra has high internode latency.`
*Severity*: warning

### Alert Name: "CassandraDatacenterLatencyHigh"

*Summary*: `Cassandra has high datacenter latency.`
*Severity*: warning

### Alert Name: "CassandraPendingCompactionTasks"

*Summary*: `Cassandra has pending compaction tasks.`
*Severity*: warning

### Alert Name: "CassandraTableDroppedMutations"

*Summary*: `Cassandra table has dropped mutations.`
*Severity*: warning

### Alert Name: "CassandraTablePartitionSizeLarge"

*Summary*: `Cassandra table has large partitions.`
*Severity*: warning
*Action*: Alert development team, as this indicates problems with the data model.

### Alert Name: "CassandraTableLiveSSTableCountHigh"

*Summary*: `Cassandra table has a high count of SSTables.`
*Severity*: warning
*Action*: Too many big tables, which will lead to performance degradation.

### Alert Name: "CassandraStoredHintsHigh"

*Summary*: `Cassandra stored hints is high.`
*Severity*: warning

### Alert Name: "CassandraHintsReplayFailed"

*Summary*: `Cassandra has failed hint replays.`
*Severity*: warning

### Alert Name: "CassandraHintsReplayTimedOut"

*Summary*: `Cassandra has timed out hint replays.`
*Severity*: warning

### Alert Name: "CassandraMemtableBlockedOnAllocation"

*Summary*: `Cassandra Memtable has blocked on allocation.`
*Severity*: warning

### Alert Name: "CassandraMemtableBlockedFlushWriterTasks"

*Summary*: `Cassandra Memtable has blocked flush writer tasks.`
*Severity*: warning
*Action*: Investigate. This condition can be caused by failing disks, excessive disk operations, and so on.

### Alert Name: "CassandraMemtableBlockedCompactorExecutorTasks"

*Summary*: `Cassandra Memtable has blocked compactor executor tasks.`
*Severity*: warning

### Alert Name: "CassandraAbortedCompactionTasks"

*Summary*: `Cassandra has aborted compaction tasks.`
*Severity*: warning

### Alert Name: "CassandraJVMGarbageCollectionTimeHigh"

*Summary*: `Cassandra has high JVM garbage collection time`
*Severity*: warning

### Alert Name: "CassandraSegmentsWaitingOnCommit"

*Summary*: `Cassandra has segments waiting on commits.`
*Severity*: warning
*Action*: High count during last minute.

### Alert Name: "CassandraSegmentsWaitingOnCommitDurationHigh"

*Summary*: `Cassandra has spent a long time waiting on commits.`
*Severity*: warning

### Alert Name: "CassandraTablePendingFlushes"

*Summary*: `Cassandra table has pending flushes.`
*Severity*: warning

### Alert Name: "CassandraTableKeyCacheHitRateLow"

*Summary*: `Cassandra table has a low key cache hit rate.`
*Severity*: warning
*Action*: If the cache is full (capacity is equal to size), increase the size of the key cache.

### Alert Name: "CassandraTargetUnreachable"

*Summary*: `Prometheus cannot scrape metrics from Cassandra nodes.`
*Severity*: warning

### Alert Name: "CassandraTargetTooManyUnreachable"

*Summary*: `Prometheus cannot scrape metrics from Cassandra nodes`
*Severity*: critical

### Alert Name: "CassandraCPUUsageHigh"

*Summary*: `Cassandra has high CPU usage.`
*Severity*: warning

### Alert Name: "CassandraLoadHigh"

*Summary*: `Cassandra has high load.`
*Severity*: warning

### Alert Name: "CassandraDiskUsageHigh"

*Summary*: `Cassandra has high disk usage.`
*Severity*: warning

### Alert Name: "CassandraTableSSTablesPerReadHigh"

*Summary*: `Cassandra table has high number of SSTables accessed per read.`
*Severity*: warning

### Alert Name: "CassandraTableTombstonesPerReadHigh"

*Summary*: `Cassandra table has high number of tombstones scanned per read.`
*Severity*: warning

### Alert Name: "CassandraTablePartitionSize99thPercentileLarge"

*Summary*: `Cassandra table has large partitions.`
*Severity*: warning