Skip to content

Commit d1d8047

Browse files
dashboard: panels with CPU and memory utilization
This patch adds `CPU/memory/virtual memory` utilization panels, both per instance and in total. Closes #TNTP-4365
1 parent 3cc409e commit d1d8047

17 files changed

+12518
-2981
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99
### Added
1010
- Panel with Cartridge configuration checksum (#242)
1111
- Panel with `need schema upgrade` status (#243)
12-
12+
- Panels with `CPU/memory/virtual memory` utilization per instance and total (#245)
1313

1414
## [3.2.1] - 2024-12-06
1515
Grafana revisions:

dashboard/panels/cpu.libsonnet

Lines changed: 193 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
local grafana = import 'grafonnet/grafana.libsonnet';
22

33
local common = import 'dashboard/panels/common.libsonnet';
4+
local common_utils = import 'dashboard/panels/common.libsonnet';
45
local variable = import 'dashboard/variable.libsonnet';
56

67
local influxdb = grafana.influxdb;
@@ -21,14 +22,14 @@ local prometheus = grafana.prometheus;
2122
format='percentunit',
2223
decimalsY1=0,
2324
min=0,
24-
panel_width=12,
25+
panel_width=8,
2526
).addTarget(
2627
common.target(cfg, metric_name, rate=true)
2728
),
2829

29-
getrusage_cpu_user_time(
30+
getrusage_cpu_instance_user_time(
3031
cfg,
31-
title='CPU user time',
32+
title='CPU user time per instance',
3233
description=|||
3334
This is the average share of time
3435
spent by instance process executing in user mode.
@@ -43,9 +44,9 @@ local prometheus = grafana.prometheus;
4344
metric_name='tnt_cpu_user_time',
4445
),
4546

46-
getrusage_cpu_system_time(
47+
getrusage_cpu_instance_system_time(
4748
cfg,
48-
title='CPU system time',
49+
title='CPU system time per instance',
4950
description=|||
5051
This is the average share of time
5152
spent by instance process executing in kernel mode.
@@ -60,6 +61,193 @@ local prometheus = grafana.prometheus;
6061
metric_name='tnt_cpu_system_time',
6162
),
6263

64+
// --------------------------------------------------------------------------
65+
local getrusage_cpu_total_percentage_graph(
66+
cfg, title, description,
67+
) = common.default_graph(
68+
cfg,
69+
title=title,
70+
description=description,
71+
format='percentunit',
72+
decimalsY1=0,
73+
min=0,
74+
panel_width=8,
75+
).addTarget(
76+
if cfg.type == variable.datasource_type.prometheus then
77+
prometheus.target(
78+
expr=std.format(
79+
|||
80+
rate(%(metrics_prefix)stnt_cpu_user_time{%(filters)s}[$__rate_interval]) +
81+
rate(%(metrics_prefix)stnt_cpu_system_time{%(filters)s}[$__rate_interval])
82+
|||,
83+
{
84+
metrics_prefix: cfg.metrics_prefix,
85+
filters: common.prometheus_query_filters(common.remove_field(cfg.filters, 'alias')),
86+
}
87+
),
88+
legendFormat='{{alias}}'
89+
)
90+
else if cfg.type == variable.datasource_type.influxdb then
91+
influxdb.target(
92+
rawQuery=true,
93+
query=|||
94+
SELECT non_negative_derivative(sum("value"), 1s)
95+
FROM "$policy"."$measurement"
96+
WHERE (("metric_name" = 'tnt_cpu_user_time' OR "metric_name" = 'tnt_cpu_system_time') AND "label_pairs_alias" =~ /^$alias$/)
97+
AND $timeFilter
98+
GROUP BY time($__interval), "label_pairs_alias" fill(none)
99+
|||,
100+
alias='$tag_label_pairs_alias',
101+
)
102+
),
103+
104+
getrusage_cpu_instance_total_time(
105+
cfg,
106+
title='CPU total time per instance',
107+
description=|||
108+
This is the average share of time spent
109+
by instance process executing.
110+
111+
Panel minimal requirements: metrics 0.8.0.
112+
|||,
113+
):: getrusage_cpu_total_percentage_graph(
114+
cfg=cfg,
115+
title=title,
116+
description=description,
117+
),
118+
119+
// --------------------------------------------------------------------------
120+
local getrusage_cpu_common_percentage_graph(
121+
cfg,
122+
title,
123+
description,
124+
prometheus_expr,
125+
prometheus_legend,
126+
influx_query,
127+
influx_alias,
128+
) = common.default_graph(
129+
cfg,
130+
title=title,
131+
description=description,
132+
format='percentunit',
133+
decimalsY1=0,
134+
min=0,
135+
panel_width=8,
136+
).addTarget(
137+
if cfg.type == variable.datasource_type.prometheus then
138+
prometheus.target(
139+
expr=prometheus_expr,
140+
legendFormat=prometheus_legend,
141+
)
142+
else if cfg.type == variable.datasource_type.influxdb then
143+
influxdb.target(
144+
rawQuery=true,
145+
query=influx_query,
146+
alias=influx_alias,
147+
)
148+
),
149+
150+
getrusage_cpu_total_time(
151+
cfg,
152+
title='CPU total time per cluster',
153+
description=|||
154+
This is the total share of time spent
155+
by each cluster process executing.
156+
157+
Panel minimal requirements: metrics 0.8.0.
158+
|||,
159+
):: getrusage_cpu_common_percentage_graph(
160+
cfg=cfg,
161+
title=title,
162+
description=description,
163+
prometheus_expr=std.format(
164+
|||
165+
sum(rate(%(metrics_prefix)stnt_cpu_user_time{%(filters)s}[$__rate_interval])) +
166+
sum(rate(%(metrics_prefix)stnt_cpu_system_time{%(filters)s}[$__rate_interval]))
167+
|||,
168+
{
169+
metrics_prefix: cfg.metrics_prefix,
170+
filters: common.prometheus_query_filters(common.remove_field(cfg.filters, 'alias')),
171+
}
172+
),
173+
prometheus_legend=title,
174+
influx_query=|||
175+
SELECT non_negative_derivative(SUM("value"), 1s) AS total_cpu_time_per_cluster
176+
FROM "$policy"."$measurement"
177+
WHERE ("metric_name" = 'tnt_cpu_user_time' OR "metric_name" = 'tnt_cpu_system_time')
178+
AND $timeFilter
179+
GROUP BY time($__interval)
180+
|||,
181+
influx_alias=title
182+
),
183+
184+
getrusage_cpu_total_user_time(
185+
cfg,
186+
title='CPU total user time per cluster',
187+
description=|||
188+
This is the total share of time
189+
spent in user mode per cluster.
190+
191+
Panel minimal requirements: metrics 0.8.0.
192+
|||,
193+
):: getrusage_cpu_common_percentage_graph(
194+
cfg=cfg,
195+
title=title,
196+
description=description,
197+
prometheus_expr=std.format(
198+
|||
199+
sum(rate(%(metrics_prefix)stnt_cpu_user_time{%(filters)s}[$__rate_interval]))
200+
|||,
201+
{
202+
metrics_prefix: cfg.metrics_prefix,
203+
filters: common.prometheus_query_filters(common.remove_field(cfg.filters, 'alias')),
204+
}
205+
),
206+
prometheus_legend=title,
207+
influx_query=|||
208+
SELECT non_negative_derivative(SUM("value"), 1s) AS total_cpu_user_time
209+
FROM "$policy"."$measurement"
210+
WHERE "metric_name" = 'tnt_cpu_user_time' AND "label_pairs_alias" =~ /^$alias$/
211+
AND $timeFilter
212+
GROUP BY time($__interval)
213+
|||,
214+
influx_alias=title
215+
),
216+
217+
getrusage_cpu_total_system_time(
218+
cfg,
219+
title='CPU total system time per cluster',
220+
description=|||
221+
This is the total share of time
222+
spent in system mode per cluster.
223+
224+
Panel minimal requirements: metrics 0.8.0.
225+
|||,
226+
):: getrusage_cpu_common_percentage_graph(
227+
cfg=cfg,
228+
title=title,
229+
description=description,
230+
prometheus_expr=std.format(
231+
|||
232+
sum(rate(%(metrics_prefix)stnt_cpu_system_time{%(filters)s}[$__rate_interval]))
233+
|||,
234+
{
235+
metrics_prefix: cfg.metrics_prefix,
236+
filters: common.prometheus_query_filters(common.remove_field(cfg.filters, 'alias')),
237+
}
238+
),
239+
prometheus_legend=title,
240+
influx_query=|||
241+
SELECT non_negative_derivative(SUM("value"), 1s) AS total_cpu_system_time
242+
FROM "$policy"."$measurement"
243+
WHERE ("metric_name" = 'tnt_cpu_system_time' AND "label_pairs_alias" =~ /^$alias$/)
244+
AND $timeFilter
245+
GROUP BY time($__interval)
246+
|||,
247+
influx_alias=title
248+
),
249+
250+
// --------------------------------------------------------------------------
63251
local procstat_thread_time_graph(
64252
cfg,
65253
title,

0 commit comments

Comments
 (0)