Skip to content

Commit 779775d

Browse files
authored
Snapshots existing already are ok! (#1759)
Previously, the Downstairs would consider a snapshot existing already as an error, and unfortunately the Upstairs would continuously retry in the face of these errors, leading to an indefinite retry loop if any previous flush (with snapshot) succeeded. This commit removes the SnapshotExistsAlready error variant, and returns Ok if the snapshot exists already. This is ok as long as: - job dependency lists are honoured, meaning no writes can land that would make each downstairs inconsistent - snapshot names are random enough that collisions are rare, and therefore an existing snapshot can be treated as the previous attempt succeeding Fixes #1758.
1 parent a364c08 commit 779775d

File tree

6 files changed

+585
-175
lines changed

6 files changed

+585
-175
lines changed

common/src/lib.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,9 +115,6 @@ pub enum CrucibleError {
115115
#[error("Snapshot failed! {0}")]
116116
SnapshotFailed(String),
117117

118-
#[error("Snapshot {0} exists already")]
119-
SnapshotExistsAlready(String),
120-
121118
#[error("Attempting to modify read-only region!")]
122119
ModifyingReadOnlyRegion,
123120

@@ -426,7 +423,6 @@ impl From<CrucibleError> for dropshot::HttpError {
426423
| CrucibleError::OffsetUnaligned
427424
| CrucibleError::RegionIncompatible(_)
428425
| CrucibleError::ReplaceRequestInvalid(_)
429-
| CrucibleError::SnapshotExistsAlready(_)
430426
| CrucibleError::Unsupported(_) => {
431427
dropshot::HttpError::for_bad_request(None, e.to_string())
432428
}

downstairs/src/region.rs

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -953,19 +953,34 @@ impl Region {
953953
if cfg!(feature = "zfs_snapshot") {
954954
if let Some(snapshot_details) = snapshot_details {
955955
info!(self.log, "Flush and snap request received");
956-
// Check if the path exists, return an error if it does
956+
957+
// Check if the path exists, return Ok if it does
957958
let test_path = format!(
958959
"{}/.zfs/snapshot/{}",
959960
self.dir.clone().into_os_string().into_string().unwrap(),
960961
snapshot_details.snapshot_name,
961962
);
962963

963964
if std::path::Path::new(&test_path).is_dir() {
964-
crucible_bail!(
965-
SnapshotExistsAlready,
966-
"{}",
965+
// If the snapshot exists already, then a previous call to
966+
// `flush` succeeded, and this function can return Ok. As
967+
// long as job dependency lists are honoured, then no
968+
// concurrent writes can come in, and even if the `flush`
969+
// was called multiple times (say from a saga that is
970+
// retrying a call) all downstairs should be consistent.
971+
//
972+
// This assumes that the snapshot name is random and name
973+
// collisions are either not possible or so rare that this
974+
// check should be considered the same as a previous flush
975+
// succeeding.
976+
977+
info!(
978+
self.log,
979+
"snapshot {} exists already, returning Ok",
967980
snapshot_details.snapshot_name,
968981
);
982+
983+
return Ok(());
969984
}
970985

971986
// Look up dataset name for path (this works with any path, and
@@ -999,6 +1014,13 @@ impl Region {
9991014
})?;
10001015

10011016
if !output.status.success() {
1017+
error!(
1018+
self.log,
1019+
"snapshot failed {} {}",
1020+
String::from_utf8_lossy(&output.stdout),
1021+
String::from_utf8_lossy(&output.stderr),
1022+
);
1023+
10021024
crucible_bail!(
10031025
SnapshotFailed,
10041026
"{}",
@@ -1011,6 +1033,7 @@ impl Region {
10111033
} else if snapshot_details.is_some() {
10121034
error!(self.log, "Snapshot request received on unsupported binary");
10131035
}
1036+
10141037
Ok(())
10151038
}
10161039

integration_tests/Cargo.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@ anyhow.workspace = true
1414
base64.workspace = true
1515
bytes.workspace = true
1616
crucible-client-types.workspace = true
17-
# importantly, don't use features = ["zfs_snapshot"] here, this will cause
18-
# cleanup issues!
19-
crucible-downstairs = { workspace = true, features = ["integration-tests"] }
17+
crucible-downstairs = { workspace = true, features = ["integration-tests", "zfs_snapshot"] }
2018
crucible-pantry-client.workspace = true
2119
crucible-pantry.workspace = true
2220
crucible = { workspace = true, features = ["integration-tests"] }

0 commit comments

Comments
 (0)