From 384d053a003995d785d54e7a676eef83eef82b8c Mon Sep 17 00:00:00 2001 From: sadhan Date: Mon, 16 Mar 2026 14:46:33 -0700 Subject: [PATCH 1/2] feat: Add metric to track if recovery deferral cancelled a recovery --- crates/walrus-service/src/node.rs | 6 ++++-- crates/walrus-service/src/node/blob_sync.rs | 24 +++++++++++++++------ crates/walrus-service/src/node/metrics.rs | 4 ++++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/crates/walrus-service/src/node.rs b/crates/walrus-service/src/node.rs index 40b1c75d8e..e97844fc7b 100644 --- a/crates/walrus-service/src/node.rs +++ b/crates/walrus-service/src/node.rs @@ -2653,10 +2653,12 @@ impl StorageNodeInner { /// Waits until the recovery deferral for the given blob ID expires /// and cancels the recovery deferral upon timeout. + /// + /// Returns `true` when a deferral existed and the caller waited for it. pub async fn wait_until_recovery_deferral_expires( &self, blob_id: &BlobId, - ) -> anyhow::Result> { + ) -> anyhow::Result { let (until, token) = { let map = self.recovery_deferrals.read().await; if let Some((u, existing_token)) = map.get(blob_id).cloned() { @@ -2681,7 +2683,7 @@ impl StorageNodeInner { self.metrics.recovery_deferral_waiters.dec(); } - Ok(token) + Ok(until.is_some()) } /// Returns the node capability object ID. diff --git a/crates/walrus-service/src/node/blob_sync.rs b/crates/walrus-service/src/node/blob_sync.rs index e42a0772e6..41665e929b 100644 --- a/crates/walrus-service/src/node/blob_sync.rs +++ b/crates/walrus-service/src/node/blob_sync.rs @@ -418,7 +418,7 @@ impl BlobSyncHandler { }, (guard, start) = async { - synchronizer.wait_for_recovery_window().await; + let waited_for_deferral = synchronizer.wait_for_recovery_window().await; let active_start = tokio::time::Instant::now(); @@ -433,7 +433,9 @@ impl BlobSyncHandler { let decrement_guard = GaugeGuard::acquire(&in_progress_gauge); - synchronizer.run(permits.sliver_pairs).await; + synchronizer + .run(permits.sliver_pairs, waited_for_deferral) + .await; (decrement_guard, active_start) } => { @@ -545,22 +547,26 @@ impl BlobSynchronizer { &self.node.metrics } - async fn wait_for_recovery_window(&self) { + async fn wait_for_recovery_window(&self) -> bool { match self .node .wait_until_recovery_deferral_expires(&self.blob_id) .await { - Ok(_token) => { + Ok(waited_for_deferral) => { self.node.clear_recovery_deferral(&self.blob_id).await; + waited_for_deferral + } + Err(err) => { + tracing::warn!(?err, "failed to wait out recovery deferral"); + false } - Err(err) => tracing::warn!(?err, "failed to wait out recovery deferral"), } } /// Runs the synchronizer until blob sync is complete. #[tracing::instrument(skip_all)] - async fn run(self, sliver_permits: Arc) { + async fn run(self, sliver_permits: Arc, waited_for_deferral: bool) { let this = Arc::new(self); let histograms = &this.metrics().recover_blob_part_duration_seconds; @@ -578,6 +584,12 @@ impl BlobSynchronizer { tracing::debug!( "blob already stored at all owned shards for current epoch, skipping recovery" ); + if waited_for_deferral { + this.node + .metrics + .live_upload_deferral_avoided_recovery_total + .inc(); + } return; } diff --git a/crates/walrus-service/src/node/metrics.rs b/crates/walrus-service/src/node/metrics.rs index 2e37b8df11..a604abd95f 100644 --- a/crates/walrus-service/src/node/metrics.rs +++ b/crates/walrus-service/src/node/metrics.rs @@ -95,6 +95,10 @@ walrus_utils::metrics::define_metric_set! { #[help = "The number of recovery tasks currently waiting for deferrals to expire"] recovery_deferral_waiters: IntGauge[], + #[help = "Total number of blob recoveries avoided because data was already present after \ + waiting for a live-upload deferral"] + live_upload_deferral_avoided_recovery_total: IntCounter[], + #[help = "Total number of sliver instances returned"] slivers_retrieved_total: IntCounterVec["sliver_type"], From 5ed8da5a92dc371c6a6838fbdc8848187a0eb4a9 Mon Sep 17 00:00:00 2001 From: sadhan Date: Mon, 16 Mar 2026 15:11:40 -0700 Subject: [PATCH 2/2] feat: Add another metric to track outcome --- crates/walrus-service/src/node/blob_sync.rs | 22 ++++++++++++++++++++- crates/walrus-service/src/node/metrics.rs | 5 +++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/crates/walrus-service/src/node/blob_sync.rs b/crates/walrus-service/src/node/blob_sync.rs index 41665e929b..fe8c3bd3dd 100644 --- a/crates/walrus-service/src/node/blob_sync.rs +++ b/crates/walrus-service/src/node/blob_sync.rs @@ -39,7 +39,14 @@ use super::{ StorageNodeInner, committee::CommitteeService, contract_service::SystemContractService, - metrics::{self, NodeMetricSet, STATUS_IN_PROGRESS, STATUS_QUEUED}, + metrics::{ + self, + LIVE_UPLOAD_DEFERRAL_OUTCOME_AVOIDED_RECOVERY, + LIVE_UPLOAD_DEFERRAL_OUTCOME_RECOVERY_NEEDED, + NodeMetricSet, + STATUS_IN_PROGRESS, + STATUS_QUEUED, + }, storage::Storage, system_events::{CompletableHandle, EventHandle}, }; @@ -589,10 +596,23 @@ impl BlobSynchronizer { .metrics .live_upload_deferral_avoided_recovery_total .inc(); + walrus_utils::with_label!( + this.node.metrics.live_upload_deferral_outcome_total, + LIVE_UPLOAD_DEFERRAL_OUTCOME_AVOIDED_RECOVERY + ) + .inc(); } return; } + if waited_for_deferral { + walrus_utils::with_label!( + this.node.metrics.live_upload_deferral_outcome_total, + LIVE_UPLOAD_DEFERRAL_OUTCOME_RECOVERY_NEEDED + ) + .inc(); + } + let shared_metadata = this .clone() .recover_metadata() diff --git a/crates/walrus-service/src/node/metrics.rs b/crates/walrus-service/src/node/metrics.rs index a604abd95f..16eb8d8b0c 100644 --- a/crates/walrus-service/src/node/metrics.rs +++ b/crates/walrus-service/src/node/metrics.rs @@ -42,6 +42,8 @@ pub(crate) const STATUS_IN_PROGRESS: &str = "in-progress"; pub(crate) const STATUS_STARTED: &str = "started"; pub(crate) const STATUS_COMPLETED: &str = "completed"; pub(crate) const STATUS_HIGHEST_FINISHED: &str = "highest_finished"; +pub(crate) const LIVE_UPLOAD_DEFERRAL_OUTCOME_AVOIDED_RECOVERY: &str = "avoided_recovery"; +pub(crate) const LIVE_UPLOAD_DEFERRAL_OUTCOME_RECOVERY_NEEDED: &str = "recovery_needed"; type U64GaugeVec = GenericGaugeVec; type U64Gauge = GenericGauge; @@ -99,6 +101,9 @@ walrus_utils::metrics::define_metric_set! { waiting for a live-upload deferral"] live_upload_deferral_avoided_recovery_total: IntCounter[], + #[help = "Total number of waited live-upload deferrals by outcome"] + live_upload_deferral_outcome_total: IntCounterVec["outcome"], + #[help = "Total number of sliver instances returned"] slivers_retrieved_total: IntCounterVec["sliver_type"],