Skip to content

Commit fe2a1dc

Browse files
authored
Fix BackupS3BlobCorrectness race condition with backup workers (#12482)
* Fix BackupS3BlobCorrectness race condition with backup workers * Wait on backup complete state to be written and use transaction submitBackup so backup agents don't go away * Reenable BackupS3BlobCorrectness test * Break out the wait on completion as standalone actor and then strip the UsePartitionedLog=True code * Fix for flakey early test exit
1 parent 2f1c194 commit fe2a1dc

File tree

3 files changed

+61
-14
lines changed

3 files changed

+61
-14
lines changed

fdbserver/workloads/BackupS3BlobCorrectness.actor.cpp

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,12 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
324324
// Only client 0 runs backup/restore operations
325325
// Other clients just run the Cycle workload
326326
if (clientId != 0) {
327-
return Void();
327+
// Calculate expected duration from TOML parameters to prevent flakey early test exit
328+
// Backup starts at backupAfter, restore at restoreAfter, plus buffer for completion
329+
// This ensures all clients finish around the same time instead of clients 1-7
330+
// finishing at 30s while client 0 takes ~100s, which confuses the test harness
331+
double expectedDuration = backupAfter + restoreAfter + 50.0; // 50s buffer for completion
332+
return delay(expectedDuration);
328333
}
329334
return _start(cx, this);
330335
}
@@ -345,22 +350,58 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
345350
loop {
346351
bool active = wait(agent.checkActive(cx));
347352
TraceEvent("BS3BCW_AgentActivityCheck").detail("IsActive", active);
348-
state std::string statusText = wait(agent.getStatus(cx, ShowErrors::True, tag));
353+
std::string statusText = wait(agent.getStatus(cx, ShowErrors::True, tag));
349354
// S3-specific: Suppress backup status output during testing to reduce noise
350355
// puts(statusText.c_str());
351356
std::string statusJSON = wait(agent.getStatusJSON(cx, tag));
352357
// puts(statusJSON.c_str());
358+
wait(delay(2.0));
359+
}
360+
}
353361

354-
// S3-specific: Exit early when backup reaches completed state or snapshot closes
355-
// This reduces unnecessary polling for S3 metadata that may be eventually consistent
356-
if (statusText.find("\"Name\":\"Completed\"") != std::string::npos ||
357-
(statusJSON.find("\"StopAfterSnapshot\":true") != std::string::npos &&
358-
statusJSON.find("\"ExpectedProgress\":100") != std::string::npos)) {
359-
TraceEvent("BS3BCW_StatusLoopExit").detail("Reason", "CompletedOrSnapshotClosed");
360-
return Void();
362+
// Wait for a backup to become restorable, with retries
363+
// This handles cases where cluster recoveries delay snapshot completion
364+
ACTOR static Future<Void> waitForRestorable(Reference<IBackupContainer> backupContainer, int maxAttempts) {
365+
state int restorabilityCheckAttempts = 0;
366+
state bool isRestorable = false;
367+
state int64_t lastSnapshotBytes = 0;
368+
369+
while (!isRestorable && restorabilityCheckAttempts < maxAttempts) {
370+
BackupDescription desc = wait(backupContainer->describeBackup());
371+
isRestorable = desc.maxRestorableVersion.present();
372+
lastSnapshotBytes = desc.snapshotBytes;
373+
if (!isRestorable) {
374+
TraceEvent("BS3BCW_WaitingForRestorable")
375+
.detail("Attempt", restorabilityCheckAttempts)
376+
.detail("SnapshotBytes", lastSnapshotBytes);
377+
wait(delay(2.0));
378+
restorabilityCheckAttempts++;
361379
}
362-
wait(delay(2.0));
363380
}
381+
382+
// Do one final check after the loop to catch snapshots that completed
383+
// between the last check and now
384+
if (!isRestorable) {
385+
BackupDescription finalDesc = wait(backupContainer->describeBackup());
386+
isRestorable = finalDesc.maxRestorableVersion.present();
387+
lastSnapshotBytes = finalDesc.snapshotBytes;
388+
if (isRestorable) {
389+
TraceEvent("BS3BCW_BackupRestorableOnFinalCheck").detail("SnapshotBytes", lastSnapshotBytes);
390+
}
391+
}
392+
393+
if (!isRestorable) {
394+
TraceEvent(SevError, "BS3BCW_BackupNotRestorableAfterWait")
395+
.detail("Attempts", restorabilityCheckAttempts)
396+
.detail("SnapshotBytes", lastSnapshotBytes);
397+
throw restore_invalid_version();
398+
}
399+
400+
TraceEvent("BS3BCW_BackupRestorable")
401+
.detail("AttemptsNeeded", restorabilityCheckAttempts)
402+
.detail("SnapshotBytes", lastSnapshotBytes);
403+
404+
return Void();
364405
}
365406

366407
ACTOR static Future<Void> doBackup(BackupS3BlobCorrectnessWorkload* self,
@@ -444,6 +485,9 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
444485
// S3-specific: Use configurable backup URL and snapshot intervals
445486
state std::string backupContainer = self->backupURL;
446487
state Future<Void> status = statusLoop(cx, tag.toString());
488+
489+
// Testing v1 (non-partitioned) backup approach
490+
// This does not require backup workers
447491
try {
448492
wait(backupAgent->submitBackup(cx,
449493
StringRef(backupContainer),
@@ -587,7 +631,10 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
587631
state Reference<IBackupContainer> lastBackupContainer =
588632
wait(BackupConfig(logUid).backupContainer().getD(cx.getReference()));
589633

634+
// Wait for backup to become restorable if it's still in progress
590635
if (lastBackupContainer) {
636+
wait(waitForRestorable(lastBackupContainer, 30));
637+
591638
// Clear the backup ranges before restoring (unless skipDirtyRestore is true)
592639
if (!self->skipDirtyRestore) {
593640
wait(runRYWTransaction(cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {

tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ if(WITH_PYTHON)
145145
add_fdb_test(TEST_FILES fast/AtomicOpsApiCorrectness.toml)
146146
add_fdb_test(TEST_FILES fast/AutomaticIdempotency.toml)
147147
add_fdb_test(TEST_FILES fast/BackupAzureBlobCorrectness.toml IGNORE)
148-
add_fdb_test(TEST_FILES fast/BackupS3BlobCorrectness.toml IGNORE)
148+
add_fdb_test(TEST_FILES fast/BackupS3BlobCorrectness.toml)
149149
add_fdb_test(TEST_FILES fast/BackupCorrectness.toml)
150150
add_fdb_test(TEST_FILES fast/BackupCorrectnessWithEKPKeyFetchFailures.toml)
151151
add_fdb_test(TEST_FILES fast/BackupCorrectnessWithTenantDeletion.toml)

tests/fast/BackupS3BlobCorrectness.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ runConsistencyCheck = false
2121
[[test.workload]]
2222
testName = 'BackupS3BlobCorrectness'
2323
backupAfter = 10.0
24-
restoreAfter = 60.0
24+
restoreAfter = 80.0
2525
abortAndRestartAfter = 0.0
2626
stopDifferentialAfter = 0.0
2727
performRestore = true
@@ -31,12 +31,12 @@ runConsistencyCheck = false
3131

3232
[[test.workload]]
3333
testName = 'RandomClogging'
34-
testDuration = 90.0
34+
testDuration = 130.0
3535

3636
[[test.workload]]
3737
testName = 'Rollback'
3838
meanDelay = 90.0
39-
testDuration = 90.0
39+
testDuration = 130.0
4040

4141

4242

0 commit comments

Comments
 (0)