Skip to content

Commit 4c6512d

Browse files
committed
some WIP longhorn stuff
1 parent 8f0a9e3 commit 4c6512d

9 files changed

+191
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
#
# Top-level Kustomization for the Longhorn add-on manifests.
# NOTE(review): `namespace` here is `storage`, while the other manifests in
# this change declare `longhorn-system`. Kustomize's namespace field is
# normally applied to every resource pulled in below - confirm which
# namespace is actually intended before enabling more resources.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: storage
resources:
  - monitoring
  # Disabled for now (work in progress):
  # - recurringjobs
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
#
# Bundles the Longhorn monitoring objects (alert rules and the scrape
# configuration) and places them in the longhorn-system namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: longhorn-system
resources:
  - prometheusrule.yaml
  - servicemonitor.yaml
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
---
# Alerting rules for Longhorn, grouped under `longhorn.rules`.
# Every rule must hold for 5 minutes (`for: 5m`) before it fires; the
# annotation texts below are kept in sync with that duration.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: longhorn
    role: alert-rules
  name: prometheus-longhorn-rules
spec:
  groups:
    - name: longhorn.rules
      rules:
        # Volume has consumed more than 90% of its capacity.
        - alert: LonghornVolumeActualSpaceUsedWarning
          annotations:
            description: >-
              The actual space used by Longhorn volume {{$labels.volume}}
              on {{$labels.node}} is at {{$value}}% capacity for more than 5 minutes.
            summary: The actual used space of Longhorn volume is over 90% of the capacity.
          expr: >-
            (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes)
            * 100 > 90
          for: 5m
          labels:
            issue: >-
              The actual used space of Longhorn volume {{$labels.volume}} on
              {{$labels.node}} is high.
            severity: warning

        # Volume robustness value 3 is reported as "Fault" by this rule.
        - alert: LonghornVolumeStatusCritical
          annotations:
            # Fixed: the text previously said "2 minutes" although the rule
            # only fires after `for: 5m`.
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is
              Fault for more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Fault
          expr: longhorn_volume_robustness == 3
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Fault.
            severity: critical

        # Volume robustness value 2 is reported as "Degraded" by this rule.
        - alert: LonghornVolumeStatusWarning
          annotations:
            description: >-
              Longhorn volume {{$labels.volume}} on {{$labels.node}} is
              Degraded for more than 5 minutes.
            summary: Longhorn volume {{$labels.volume}} is Degraded
          expr: longhorn_volume_robustness == 2
          for: 5m
          labels:
            issue: Longhorn volume {{$labels.volume}} is Degraded.
            severity: warning

        # Node-level storage usage above 70% of capacity.
        - alert: LonghornNodeStorageWarning
          annotations:
            description: >-
              The used storage of node {{$labels.node}} is at {{$value}}%
              capacity for more than 5 minutes.
            summary: The used storage of node is over 70% of the capacity.
          expr: >-
            (longhorn_node_storage_usage_bytes / longhorn_node_storage_capacity_bytes)
            * 100 > 70
          for: 5m
          labels:
            issue: The used storage of node {{$labels.node}} is high.
            severity: warning

        # Per-disk storage usage above 70% of capacity.
        - alert: LonghornDiskStorageWarning
          annotations:
            description: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}}
              is at {{$value}}% capacity for more than 5 minutes.
            summary: The used storage of disk is over 70% of the capacity.
          expr: >-
            (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100 >
            70
          for: 5m
          labels:
            issue: >-
              The used storage of disk {{$labels.disk}} on node {{$labels.node}}
              is high.
            severity: warning

        # Number of nodes that are not reporting the "ready" condition.
        - alert: LonghornNodeDown
          annotations:
            description: >-
              There are {{$value}} Longhorn nodes which have been offline
              for more than 5 minutes.
            summary: Longhorn nodes is offline
          # Fixed: the previous expression had no threshold, so any returned
          # sample (including a difference of 0) would raise the alert, and
          # it subtracted a label-less count() from a labelled metric.
          # Both sides are now aggregated to label-less vectors and the
          # result is compared against 0.
          # NOTE(review): `longhorn_node_count_total` is the metric name used
          # by current Longhorn releases (older ones exposed
          # `longhorn_node_total`) - confirm against the deployed version.
          expr: >-
            (avg(longhorn_node_count_total) or on() vector(0)) -
            (count(longhorn_node_status{condition="ready"} == 1) or on() vector(0)) > 0
          for: 5m
          labels:
            # Kept static on purpose: labels are part of an alert's identity,
            # so templating {{$value}} here would create a new alert (and
            # restart the `for` timer) whenever the count changes. The count
            # is still available in the description annotation.
            issue: Longhorn nodes are offline.
            severity: critical

        # Instance manager CPU usage above 300% of its CPU request.
        # The alert name's spelling ("Intance") is left untouched so that any
        # routing or silences matching on it keep working.
        - alert: LonghornIntanceManagerCPUUsageWarning
          annotations:
            description: >-
              Longhorn instance manager {{$labels.instance_manager}} on
              {{$labels.node}} has CPU Usage / CPU request is {{$value}}% for more
              than 5 minutes.
            summary: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}}
              has CPU Usage / CPU request is over 300%.
          expr: >-
            (longhorn_instance_manager_cpu_usage_millicpu/longhorn_instance_manager_cpu_requests_millicpu)
            * 100 > 300
          for: 5m
          labels:
            issue: >-
              Longhorn instance manager {{$labels.instance_manager}} on {{$labels.node}}
              consumes 3 times the CPU request.
            severity: warning

        # Node CPU usage above 90% of its CPU capacity.
        - alert: LonghornNodeCPUUsageWarning
          annotations:
            description: >-
              Longhorn node {{$labels.node}} has CPU Usage / CPU capacity
              is {{$value}}% for more than 5 minutes.
            summary: >-
              Longhorn node {{$labels.node}} experiences high CPU pressure
              for more than 5m.
          expr: >-
            (longhorn_node_cpu_usage_millicpu / longhorn_node_cpu_capacity_millicpu)
            * 100 > 90
          for: 5m
          labels:
            issue: Longhorn node {{$labels.node}} experiences high CPU pressure.
            severity: warning
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
---
# ServiceMonitor that points Prometheus at the Longhorn manager Service.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: longhorn-prometheus-servicemonitor
  labels:
    name: longhorn-prometheus-servicemonitor
spec:
  # Only look for matching Services inside the longhorn-system namespace.
  namespaceSelector:
    matchNames:
      - longhorn-system
  # Match Services carrying the label app=longhorn-manager.
  selector:
    matchLabels:
      app: longhorn-manager
  # Scrape the Service port named "manager".
  endpoints:
    - port: manager
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
---
# Longhorn recurring job: snapshot task for volumes in the "normal" group.
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
  name: 30min-snapshot
  namespace: longhorn-system
spec:
  task: snapshot
  # Minute field "0/30": starting at minute 0, every 30 minutes.
  # Quoted so the schedule is always read as a plain string.
  cron: '0/30 * * * *'
  groups:
    - normal
  retain: 4
  concurrency: 2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
---
# Longhorn recurring job: backup task for volumes in the "normal" group.
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
  name: daily-backup
  namespace: longhorn-system
spec:
  task: backup
  # Once a day at minute 45 of hour 0.
  # Quoted so the schedule is always read as a plain string.
  cron: '45 0 * * *'
  groups:
    - normal
  retain: 7
  concurrency: 2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
---
# Longhorn recurring job: backup task for volumes in the "normal" group.
apiVersion: longhorn.io/v1beta1
kind: RecurringJob
metadata:
  name: hourly-backup
  namespace: longhorn-system
spec:
  task: backup
  # Every hour at minute 15.
  # Quoted so the schedule is always read as a plain string.
  cron: '15 * * * *'
  groups:
    - normal
  retain: 6
  concurrency: 2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
#
# Bundles the Longhorn RecurringJob manifests (one snapshot schedule and two
# backup schedules) and places them in the longhorn-system namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: longhorn-system
resources:
  - 30min-snapshot.yaml
  - daily-backup.yaml
  - hourly-backup.yaml
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
---
# VolumeSnapshotClass named "longhorn", served by the driver.longhorn.io
# CSI driver, with the Delete deletion policy.
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
  name: longhorn
driver: driver.longhorn.io
deletionPolicy: Delete

0 commit comments

Comments
 (0)