@@ -324,7 +324,12 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
324324 // Only client 0 runs backup/restore operations
325325 // Other clients just run the Cycle workload
326326 if (clientId != 0 ) {
327- return Void ();
327+ // Calculate expected duration from TOML parameters to prevent flakey early test exit
328+ // Backup starts at backupAfter, restore at restoreAfter, plus buffer for completion
329+ // This ensures all clients finish around the same time instead of clients 1-7
330+ // finishing at 30s while client 0 takes ~100s, which confuses the test harness
331+ double expectedDuration = backupAfter + restoreAfter + 50.0 ; // 50s buffer for completion
332+ return delay (expectedDuration);
328333 }
329334 return _start (cx, this );
330335 }
@@ -345,22 +350,58 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
345350 loop {
346351 bool active = wait (agent.checkActive (cx));
347352 TraceEvent (" BS3BCW_AgentActivityCheck" ).detail (" IsActive" , active);
348- state std::string statusText = wait (agent.getStatus (cx, ShowErrors::True, tag));
353+ std::string statusText = wait (agent.getStatus (cx, ShowErrors::True, tag));
349354 // S3-specific: Suppress backup status output during testing to reduce noise
350355 // puts(statusText.c_str());
351356 std::string statusJSON = wait (agent.getStatusJSON (cx, tag));
352357 // puts(statusJSON.c_str());
358+ wait (delay (2.0 ));
359+ }
360+ }
353361
354- // S3-specific: Exit early when backup reaches completed state or snapshot closes
355- // This reduces unnecessary polling for S3 metadata that may be eventually consistent
356- if (statusText.find (" \" Name\" :\" Completed\" " ) != std::string::npos ||
357- (statusJSON.find (" \" StopAfterSnapshot\" :true" ) != std::string::npos &&
358- statusJSON.find (" \" ExpectedProgress\" :100" ) != std::string::npos)) {
359- TraceEvent (" BS3BCW_StatusLoopExit" ).detail (" Reason" , " CompletedOrSnapshotClosed" );
360- return Void ();
362+ // Wait for a backup to become restorable, with retries
363+ // This handles cases where cluster recoveries delay snapshot completion
364+ ACTOR static Future<Void> waitForRestorable (Reference<IBackupContainer> backupContainer, int maxAttempts) {
365+ state int restorabilityCheckAttempts = 0 ;
366+ state bool isRestorable = false ;
367+ state int64_t lastSnapshotBytes = 0 ;
368+
369+ while (!isRestorable && restorabilityCheckAttempts < maxAttempts) {
370+ BackupDescription desc = wait (backupContainer->describeBackup ());
371+ isRestorable = desc.maxRestorableVersion .present ();
372+ lastSnapshotBytes = desc.snapshotBytes ;
373+ if (!isRestorable) {
374+ TraceEvent (" BS3BCW_WaitingForRestorable" )
375+ .detail (" Attempt" , restorabilityCheckAttempts)
376+ .detail (" SnapshotBytes" , lastSnapshotBytes);
377+ wait (delay (2.0 ));
378+ restorabilityCheckAttempts++;
361379 }
362- wait (delay (2.0 ));
363380 }
381+
382+ // Do one final check after the loop to catch snapshots that completed
383+ // between the last check and now
384+ if (!isRestorable) {
385+ BackupDescription finalDesc = wait (backupContainer->describeBackup ());
386+ isRestorable = finalDesc.maxRestorableVersion .present ();
387+ lastSnapshotBytes = finalDesc.snapshotBytes ;
388+ if (isRestorable) {
389+ TraceEvent (" BS3BCW_BackupRestorableOnFinalCheck" ).detail (" SnapshotBytes" , lastSnapshotBytes);
390+ }
391+ }
392+
393+ if (!isRestorable) {
394+ TraceEvent (SevError, " BS3BCW_BackupNotRestorableAfterWait" )
395+ .detail (" Attempts" , restorabilityCheckAttempts)
396+ .detail (" SnapshotBytes" , lastSnapshotBytes);
397+ throw restore_invalid_version ();
398+ }
399+
400+ TraceEvent (" BS3BCW_BackupRestorable" )
401+ .detail (" AttemptsNeeded" , restorabilityCheckAttempts)
402+ .detail (" SnapshotBytes" , lastSnapshotBytes);
403+
404+ return Void ();
364405 }
365406
366407 ACTOR static Future<Void> doBackup (BackupS3BlobCorrectnessWorkload* self,
@@ -444,6 +485,9 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
444485 // S3-specific: Use configurable backup URL and snapshot intervals
445486 state std::string backupContainer = self->backupURL ;
446487 state Future<Void> status = statusLoop (cx, tag.toString ());
488+
489+ // Testing v1 (non-partitioned) backup approach
490+ // This does not require backup workers
447491 try {
448492 wait (backupAgent->submitBackup (cx,
449493 StringRef (backupContainer),
@@ -587,7 +631,10 @@ struct BackupS3BlobCorrectnessWorkload : TestWorkload {
587631 state Reference<IBackupContainer> lastBackupContainer =
588632 wait (BackupConfig (logUid).backupContainer ().getD (cx.getReference ()));
589633
634+ // Wait for backup to become restorable if it's still in progress
590635 if (lastBackupContainer) {
636+ wait (waitForRestorable (lastBackupContainer, 30 ));
637+
591638 // Clear the backup ranges before restoring (unless skipDirtyRestore is true)
592639 if (!self->skipDirtyRestore ) {
593640 wait (runRYWTransaction (cx, [=](Reference<ReadYourWritesTransaction> tr) -> Future<Void> {
0 commit comments