diff --git a/crates/walrus-service/src/node.rs b/crates/walrus-service/src/node.rs index 40b1c75d8e..e97844fc7b 100644 --- a/crates/walrus-service/src/node.rs +++ b/crates/walrus-service/src/node.rs @@ -2653,10 +2653,12 @@ impl StorageNodeInner { /// Waits until the recovery deferral for the given blob ID expires /// and cancels the recovery deferral upon timeout. + /// + /// Returns `true` when a deferral existed and the caller waited for it. pub async fn wait_until_recovery_deferral_expires( &self, blob_id: &BlobId, - ) -> anyhow::Result> { + ) -> anyhow::Result { let (until, token) = { let map = self.recovery_deferrals.read().await; if let Some((u, existing_token)) = map.get(blob_id).cloned() { @@ -2681,7 +2683,7 @@ impl StorageNodeInner { self.metrics.recovery_deferral_waiters.dec(); } - Ok(token) + Ok(until.is_some()) } /// Returns the node capability object ID. diff --git a/crates/walrus-service/src/node/blob_sync.rs b/crates/walrus-service/src/node/blob_sync.rs index e42a0772e6..fe8c3bd3dd 100644 --- a/crates/walrus-service/src/node/blob_sync.rs +++ b/crates/walrus-service/src/node/blob_sync.rs @@ -39,7 +39,14 @@ use super::{ StorageNodeInner, committee::CommitteeService, contract_service::SystemContractService, - metrics::{self, NodeMetricSet, STATUS_IN_PROGRESS, STATUS_QUEUED}, + metrics::{ + self, + LIVE_UPLOAD_DEFERRAL_OUTCOME_AVOIDED_RECOVERY, + LIVE_UPLOAD_DEFERRAL_OUTCOME_RECOVERY_NEEDED, + NodeMetricSet, + STATUS_IN_PROGRESS, + STATUS_QUEUED, + }, storage::Storage, system_events::{CompletableHandle, EventHandle}, }; @@ -418,7 +425,7 @@ impl BlobSyncHandler { }, (guard, start) = async { - synchronizer.wait_for_recovery_window().await; + let waited_for_deferral = synchronizer.wait_for_recovery_window().await; let active_start = tokio::time::Instant::now(); @@ -433,7 +440,9 @@ impl BlobSyncHandler { let decrement_guard = GaugeGuard::acquire(&in_progress_gauge); - synchronizer.run(permits.sliver_pairs).await; + synchronizer + .run(permits.sliver_pairs, waited_for_deferral) + .await; (decrement_guard, active_start) } => { @@ -545,22 +554,26 @@ impl BlobSynchronizer { &self.node.metrics } - async fn wait_for_recovery_window(&self) { + async fn wait_for_recovery_window(&self) -> bool { match self .node .wait_until_recovery_deferral_expires(&self.blob_id) .await { - Ok(_token) => { + Ok(waited_for_deferral) => { self.node.clear_recovery_deferral(&self.blob_id).await; + waited_for_deferral + } + Err(err) => { + tracing::warn!(?err, "failed to wait out recovery deferral"); + false } - Err(err) => tracing::warn!(?err, "failed to wait out recovery deferral"), } } /// Runs the synchronizer until blob sync is complete. #[tracing::instrument(skip_all)] - async fn run(self, sliver_permits: Arc) { + async fn run(self, sliver_permits: Arc, waited_for_deferral: bool) { let this = Arc::new(self); let histograms = &this.metrics().recover_blob_part_duration_seconds; @@ -578,9 +591,28 @@ impl BlobSynchronizer { tracing::debug!( "blob already stored at all owned shards for current epoch, skipping recovery" ); + if waited_for_deferral { + this.node + .metrics + .live_upload_deferral_avoided_recovery_total + .inc(); + walrus_utils::with_label!( + this.node.metrics.live_upload_deferral_outcome_total, + LIVE_UPLOAD_DEFERRAL_OUTCOME_AVOIDED_RECOVERY + ) + .inc(); + } return; } + if waited_for_deferral { + walrus_utils::with_label!( + this.node.metrics.live_upload_deferral_outcome_total, + LIVE_UPLOAD_DEFERRAL_OUTCOME_RECOVERY_NEEDED + ) + .inc(); + } + let shared_metadata = this .clone() .recover_metadata() diff --git a/crates/walrus-service/src/node/metrics.rs b/crates/walrus-service/src/node/metrics.rs index 2e37b8df11..16eb8d8b0c 100644 --- a/crates/walrus-service/src/node/metrics.rs +++ b/crates/walrus-service/src/node/metrics.rs @@ -42,6 +42,8 @@ pub(crate) const STATUS_IN_PROGRESS: &str = "in-progress"; pub(crate) const STATUS_STARTED: &str = "started"; pub(crate) const STATUS_COMPLETED: &str = "completed"; pub(crate) const STATUS_HIGHEST_FINISHED: &str = "highest_finished"; +pub(crate) const LIVE_UPLOAD_DEFERRAL_OUTCOME_AVOIDED_RECOVERY: &str = "avoided_recovery"; +pub(crate) const LIVE_UPLOAD_DEFERRAL_OUTCOME_RECOVERY_NEEDED: &str = "recovery_needed"; type U64GaugeVec = GenericGaugeVec; type U64Gauge = GenericGauge; @@ -95,6 +97,13 @@ walrus_utils::metrics::define_metric_set! { #[help = "The number of recovery tasks currently waiting for deferrals to expire"] recovery_deferral_waiters: IntGauge[], + #[help = "Total number of blob recoveries avoided because data was already present after \ + waiting for a live-upload deferral"] + live_upload_deferral_avoided_recovery_total: IntCounter[], + + #[help = "Total number of waited live-upload deferrals by outcome"] + live_upload_deferral_outcome_total: IntCounterVec["outcome"], + #[help = "Total number of sliver instances returned"] slivers_retrieved_total: IntCounterVec["sliver_type"],