Skip to content

Commit 336bbea

Browse files
fix(metric): update committed epoch metrics (#19959) (#19961)
Co-authored-by: zwang28 <[email protected]>
1 parent d2640de commit 336bbea

8 files changed

+35
-198
lines changed

docker/dashboards/risingwave-dev-dashboard.json

+1-1
Large diffs are not rendered by default.

docker/dashboards/risingwave-user-dashboard.json

+1-1
Large diffs are not rendered by default.

grafana/risingwave-dev-dashboard.dashboard.py

+2-180
Original file line numberDiff line numberDiff line change
@@ -3327,9 +3327,9 @@ def section_hummock_manager(outer_panels):
33273327
f"{metric('storage_max_committed_epoch')}",
33283328
"max committed epoch",
33293329
),
3330-
panels.target(f"{metric('storage_safe_epoch')}", "safe epoch"),
33313330
panels.target(
3332-
f"{metric('storage_min_pinned_epoch')}", "min pinned epoch"
3331+
f"{metric('storage_min_committed_epoch')}",
3332+
"min committed epoch",
33333333
),
33343334
],
33353335
),
@@ -3676,182 +3676,6 @@ def section_grpc_meta_stream_manager(outer_panels):
36763676
),
36773677
]
36783678

3679-
3680-
def section_grpc_meta_hummock_manager(outer_panels):
3681-
panels = outer_panels.sub_panel()
3682-
return [
3683-
outer_panels.row_collapsed(
3684-
"gRPC Meta: Hummock Manager",
3685-
[
3686-
grpc_metrics_target(
3687-
panels,
3688-
"UnpinVersionBefore",
3689-
"path='/meta.HummockManagerService/UnpinVersionBefore'",
3690-
),
3691-
grpc_metrics_target(
3692-
panels,
3693-
"ReportCompactionTasks",
3694-
"path='/meta.HummockManagerService/ReportCompactionTasks'",
3695-
),
3696-
grpc_metrics_target(
3697-
panels,
3698-
"GetNewSstIds",
3699-
"path='/meta.HummockManagerService/GetNewSstIds'",
3700-
),
3701-
],
3702-
),
3703-
]
3704-
3705-
3706-
def section_grpc_hummock_meta_client(outer_panels):
3707-
panels = outer_panels.sub_panel()
3708-
return [
3709-
outer_panels.row_collapsed(
3710-
"gRPC: Hummock Meta Client",
3711-
[
3712-
panels.timeseries_count(
3713-
"compaction_count",
3714-
"",
3715-
[
3716-
panels.target(
3717-
f"sum(irate({metric('state_store_report_compaction_task_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
3718-
"report_compaction_task_counts - {{%s}}" % NODE_LABEL,
3719-
),
3720-
],
3721-
),
3722-
panels.timeseries_latency(
3723-
"version_latency",
3724-
"",
3725-
[
3726-
panels.target(
3727-
f"histogram_quantile(0.5, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3728-
"unpin_version_before_latency_p50 - {{%s}}" % NODE_LABEL,
3729-
),
3730-
panels.target(
3731-
f"histogram_quantile(0.99, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3732-
"unpin_version_before_latency_p99 - {{%s}}" % NODE_LABEL,
3733-
),
3734-
panels.target(
3735-
f"sum(irate({metric('state_store_unpin_version_before_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_unpin_version_before_latency_count')}[$__rate_interval])) > 0",
3736-
"unpin_version_before_latency_avg",
3737-
),
3738-
panels.target(
3739-
f"histogram_quantile(0.90, sum(irate({metric('state_store_unpin_version_before_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3740-
"unpin_version_before_latency_p90 - {{%s}}" % NODE_LABEL,
3741-
),
3742-
],
3743-
),
3744-
panels.timeseries_latency(
3745-
"snapshot_latency",
3746-
"",
3747-
[
3748-
panels.target(
3749-
f"histogram_quantile(0.5, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3750-
"pin_snapshot_latency_p50 - {{%s}}" % NODE_LABEL,
3751-
),
3752-
panels.target(
3753-
f"histogram_quantile(0.99, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3754-
"pin_snapshot_latency_p99 - {{%s}}" % NODE_LABEL,
3755-
),
3756-
panels.target(
3757-
f"histogram_quantile(0.9, sum(irate({metric('state_store_pin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3758-
"pin_snapshot_latencyp90 - {{%s}}" % NODE_LABEL,
3759-
),
3760-
panels.target(
3761-
f"sum(irate({metric('state_store_pin_snapshot_latency_sum')}[$__rate_interval])) / sum(irate(state_store_pin_snapshot_latency_count[$__rate_interval])) > 0",
3762-
"pin_snapshot_latency_avg",
3763-
),
3764-
panels.target(
3765-
f"histogram_quantile(0.5, sum(irate({metric('state_store_unpin_version_snapshot_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3766-
"unpin_snapshot_latency_p50 - {{%s}}" % NODE_LABEL,
3767-
),
3768-
panels.target(
3769-
f"histogram_quantile(0.99, sum(irate({metric('state_store_unpin_version_snapshot_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3770-
"unpin_snapshot_latency_p99 - {{%s}}" % NODE_LABEL,
3771-
),
3772-
panels.target(
3773-
f"sum(irate({metric('state_store_unpin_snapshot_latency_sum')}[$__rate_interval])) / sum(irate(state_store_unpin_snapshot_latency_count[$__rate_interval])) > 0",
3774-
"unpin_snapshot_latency_avg",
3775-
),
3776-
panels.target(
3777-
f"histogram_quantile(0.90, sum(irate({metric('state_store_unpin_snapshot_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3778-
"unpin_snapshot_latency_p90 - {{%s}}" % NODE_LABEL,
3779-
),
3780-
],
3781-
),
3782-
panels.timeseries_count(
3783-
"snapshot_count",
3784-
"",
3785-
[
3786-
panels.target(
3787-
f"sum(irate({metric('state_store_pin_snapshot_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
3788-
"pin_snapshot_counts - {{%s}}" % NODE_LABEL,
3789-
),
3790-
panels.target(
3791-
f"sum(irate({metric('state_store_unpin_snapshot_counts')}[$__rate_interval])) by({COMPONENT_LABEL}, {NODE_LABEL})",
3792-
"unpin_snapshot_counts - {{%s}}" % NODE_LABEL,
3793-
),
3794-
],
3795-
),
3796-
panels.timeseries_latency(
3797-
"table_latency",
3798-
"",
3799-
[
3800-
panels.target(
3801-
f"histogram_quantile(0.5, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3802-
"get_new_sst_ids_latency_latency_p50 - {{%s}}" % NODE_LABEL,
3803-
),
3804-
panels.target(
3805-
f"histogram_quantile(0.99, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3806-
"get_new_sst_ids_latency_latency_p99 - {{%s}}" % NODE_LABEL,
3807-
),
3808-
panels.target(
3809-
f"sum(irate({metric('state_store_get_new_sst_ids_latency_sum')}[$__rate_interval])) / sum(irate({metric('state_store_get_new_sst_ids_latency_count')}[$__rate_interval])) > 0",
3810-
"get_new_sst_ids_latency_latency_avg",
3811-
),
3812-
panels.target(
3813-
f"histogram_quantile(0.90, sum(irate({metric('state_store_get_new_sst_ids_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3814-
"get_new_sst_ids_latency_latency_p90 - {{%s}}" % NODE_LABEL,
3815-
),
3816-
],
3817-
),
3818-
panels.timeseries_count(
3819-
"table_count",
3820-
"",
3821-
[
3822-
panels.target(
3823-
f"sum(irate({metric('state_store_get_new_sst_ids_latency_counts')}[$__rate_interval]))by({COMPONENT_LABEL}, {NODE_LABEL})",
3824-
"get_new_sst_ids_latency_counts - {{%s}}" % NODE_LABEL,
3825-
),
3826-
],
3827-
),
3828-
panels.timeseries_latency(
3829-
"compaction_latency",
3830-
"",
3831-
[
3832-
panels.target(
3833-
f"histogram_quantile(0.5, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3834-
"report_compaction_task_latency_p50 - {{%s}}" % NODE_LABEL,
3835-
),
3836-
panels.target(
3837-
f"histogram_quantile(0.99, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3838-
"report_compaction_task_latency_p99 - {{%s}}" % NODE_LABEL,
3839-
),
3840-
panels.target(
3841-
f"sum(irate({metric('state_store_report_compaction_task_latency_sum')}[$__rate_interval])) / sum(irate(state_store_report_compaction_task_latency_count[$__rate_interval])) > 0",
3842-
"report_compaction_task_latency_avg",
3843-
),
3844-
panels.target(
3845-
f"histogram_quantile(0.90, sum(irate({metric('state_store_report_compaction_task_latency_bucket')}[$__rate_interval])) by (le, {COMPONENT_LABEL}, {NODE_LABEL}))",
3846-
"report_compaction_task_latency_p90 - {{%s}}" % NODE_LABEL,
3847-
),
3848-
],
3849-
),
3850-
],
3851-
),
3852-
]
3853-
3854-
38553679
def section_kafka_metrics(outer_panels):
38563680
panels = outer_panels.sub_panel()
38573681
return [
@@ -5009,8 +4833,6 @@ def section_udf(outer_panels):
50094833
*section_grpc_meta_catalog_service(panels),
50104834
*section_grpc_meta_cluster_service(panels),
50114835
*section_grpc_meta_stream_manager(panels),
5012-
*section_grpc_meta_hummock_manager(panels),
5013-
*section_grpc_hummock_meta_client(panels),
50144836
*section_frontend(panels),
50154837
*section_memory_manager(panels),
50164838
*section_sink_metrics(panels),

grafana/risingwave-dev-dashboard.json

+1-1
Large diffs are not rendered by default.

grafana/risingwave-user-dashboard.json

+1-1
Large diffs are not rendered by default.

src/meta/src/hummock/manager/commit_epoch.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ use crate::hummock::manager::transaction::{
4141
};
4242
use crate::hummock::manager::versioning::Versioning;
4343
use crate::hummock::metrics_utils::{
44-
get_or_create_local_table_stat, trigger_local_table_stat, trigger_sst_stat,
44+
get_or_create_local_table_stat, trigger_epoch_stat, trigger_local_table_stat, trigger_sst_stat,
4545
};
4646
use crate::hummock::model::CompactionGroup;
4747
use crate::hummock::sequence::{next_compaction_group_id, next_sstable_object_id};
@@ -293,6 +293,7 @@ impl HummockManager {
293293
*compaction_group_id,
294294
);
295295
}
296+
trigger_epoch_stat(&self.metrics, &versioning.current_version);
296297

297298
drop(versioning_guard);
298299

src/meta/src/hummock/metrics_utils.rs

+21
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,27 @@ pub fn trigger_sst_stat(
311311
}
312312
}
313313

314+
pub fn trigger_epoch_stat(metrics: &MetaMetrics, version: &HummockVersion) {
315+
metrics.max_committed_epoch.set(
316+
version
317+
.state_table_info
318+
.info()
319+
.values()
320+
.map(|i| i.committed_epoch)
321+
.max()
322+
.unwrap_or(0) as _,
323+
);
324+
metrics.min_committed_epoch.set(
325+
version
326+
.state_table_info
327+
.info()
328+
.values()
329+
.map(|i| i.committed_epoch)
330+
.min()
331+
.unwrap_or(0) as _,
332+
);
333+
}
334+
314335
pub fn remove_compaction_group_in_sst_stat(
315336
metrics: &MetaMetrics,
316337
compaction_group_id: CompactionGroupId,

src/meta/src/rpc/metrics.rs

+6-13
Original file line numberDiff line numberDiff line change
@@ -95,10 +95,8 @@ pub struct MetaMetrics {
9595
// ********************************** Hummock ************************************
9696
/// Max committed epoch
9797
pub max_committed_epoch: IntGauge,
98-
/// The smallest epoch that has not been `GCed`.
99-
pub safe_epoch: IntGauge,
100-
/// The smallest epoch that is being pinned.
101-
pub min_pinned_epoch: IntGauge,
98+
/// Min committed epoch
99+
pub min_committed_epoch: IntGauge,
102100
/// The number of SSTs in each level
103101
pub level_sst_num: IntGaugeVec,
104102
/// The number of SSTs to be merged to next level in each level
@@ -309,13 +307,9 @@ impl MetaMetrics {
309307
)
310308
.unwrap();
311309

312-
let safe_epoch =
313-
register_int_gauge_with_registry!("storage_safe_epoch", "safe epoch", registry)
314-
.unwrap();
315-
316-
let min_pinned_epoch = register_int_gauge_with_registry!(
317-
"storage_min_pinned_epoch",
318-
"min pinned epoch",
310+
let min_committed_epoch = register_int_gauge_with_registry!(
311+
"storage_min_committed_epoch",
312+
"min committed epoch",
319313
registry
320314
)
321315
.unwrap();
@@ -794,8 +788,7 @@ impl MetaMetrics {
794788
recovery_latency,
795789

796790
max_committed_epoch,
797-
safe_epoch,
798-
min_pinned_epoch,
791+
min_committed_epoch,
799792
level_sst_num,
800793
level_compact_cnt,
801794
compact_frequency,

0 commit comments

Comments
 (0)