Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions docker/mongodb-kubernetes-tests/kubetester/mongodb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,26 @@

class MongoDBCommon:
@TRACER.start_as_current_span("wait_for")
def wait_for(self, fn, timeout=None, should_raise=True):
def wait_for(self, fn, timeout=None, should_raise=True, persist_for=1):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is persist_for essentially equivalent to "required consecutive successes to pass"?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if yes, then pls leave a brief comment or rename it, it took me a bit thinking to get the meaning of this param

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and one of the reasons was that "persist for" suggests that the problem persists, or that the goal is not achieved yet and the situation persists. When something succeeds, it's not usually described as a situation that "persists". But I might be nitpicking here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is persist_for essentially equivalent to "required consecutive successes to pass"?

Yes

Not the best name, I know — I couldn't find a better one. I can add a comment, but do you have a better option? no_of_successful_passes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While I'm not a fan of Gomega and Ginkgo, I think we should separate "wait for something" from "consistently meets something", like they do:

if timeout is None:
timeout = 600
initial_timeout = timeout

wait = 3
retries = 0
while timeout > 0:
try:
self.reload()
except Exception as e:
print(f"Caught error: {e} while waiting for {fn.__name__}")
pass
if fn(self):
return True
retries += 1
if retries == persist_for:
return True
else:
retries = 0

timeout -= wait
time.sleep(wait)

Expand Down
17 changes: 6 additions & 11 deletions docker/mongodb-kubernetes-tests/kubetester/opsmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1017,13 +1017,7 @@ def is_om_multi_cluster(self):
return self["spec"].get("topology", "") == "MultiCluster"

class StatusCommon:
def assert_reaches_phase(
self,
phase: Phase,
msg_regexp=None,
timeout=None,
ignore_errors=False,
):
def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=None, ignore_errors=False, persist_for=1):
intermediate_events = (
# This can be an intermediate error, right before we check for this secret we create it.
# The cluster might just be slow
Expand All @@ -1046,6 +1040,7 @@ def assert_reaches_phase(
),
timeout,
should_raise=True,
persist_for=persist_for,
)
end_time = time.time()
span = trace.get_current_span()
Expand Down Expand Up @@ -1110,8 +1105,8 @@ def __init__(self, ops_manager: MongoDBOpsManager):
def assert_abandons_phase(self, phase: Phase, timeout=400):
    """Forward to the base-class implementation, overriding only the default timeout (400s).

    NOTE(review): this subclass changes nothing except the default `timeout`;
    verify it matches the base-class signature if that signature changes.
    """
    super().assert_abandons_phase(phase, timeout)

def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False):
super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)
def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1000, ignore_errors=False, persist_for=1):
    """Forward to the base-class assert_reaches_phase with a 1000s default timeout.

    persist_for: number of consecutive successful phase checks required before
    the phase is considered reached (forwarded to the base implementation;
    default 1 preserves the previous single-check behavior).
    """
    super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)

def get_phase(self) -> Optional[Phase]:
try:
Expand Down Expand Up @@ -1156,8 +1151,8 @@ def __init__(self, ops_manager: MongoDBOpsManager):
def assert_abandons_phase(self, phase: Phase, timeout=400):
    """Forward to the base-class implementation, overriding only the default timeout (400s).

    NOTE(review): this subclass changes nothing except the default `timeout`;
    verify it matches the base-class signature if that signature changes.
    """
    super().assert_abandons_phase(phase, timeout)

def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False):
super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors)
def assert_reaches_phase(self, phase: Phase, msg_regexp=None, timeout=1200, ignore_errors=False, persist_for=1):
    """Forward to the base-class assert_reaches_phase with a 1200s default timeout.

    persist_for: number of consecutive successful phase checks required before
    the phase is considered reached (forwarded to the base implementation;
    default 1 preserves the previous single-check behavior).
    """
    super().assert_reaches_phase(phase, msg_regexp, timeout, ignore_errors, persist_for=persist_for)

def get_phase(self) -> Optional[Phase]:
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def test_scale_appdb(self, ops_manager: MongoDBOpsManager):
# Reordering the clusters triggers a change in the state
ops_manager["spec"]["applicationDatabase"]["clusterSpecList"] = scale_on_upgrade.cluster_spec
ops_manager.update()
ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=500)
ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600)
ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=250)

def test_migrated_state_correctness(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_running(namespace: str):
try:
logger.debug(f"Waiting for {sc.name} to reach Running phase")
# Once the first resource reached Running, it shouldn't take more than ~300s for the others to do so
sc.assert_reaches_phase(Phase.Running, timeout=900 if first_iter else 300)
sc.assert_reaches_phase(Phase.Running, timeout=1200 if first_iter else 300)
succeeded_resources.append(sc.name)
first_iter = False
logger.info(f"{sc.name} reached Running phase")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,8 @@ def test_upgrade_operator(
@mark.e2e_appdb_tls_operator_upgrade_v1_32_to_mck
def test_om_tls_ok(ops_manager_tls: MongoDBOpsManager):
ops_manager_tls.load()
ops_manager_tls.appdb_status().assert_reaches_phase(Phase.Running, timeout=900)
ops_manager_tls.om_status().assert_reaches_phase(Phase.Running, timeout=900)
ops_manager_tls.appdb_status().assert_reaches_phase(Phase.Running, timeout=900, persist_for=3)
ops_manager_tls.om_status().assert_reaches_phase(Phase.Running, timeout=900, persist_for=3)
ops_manager_tls.get_om_tester().assert_healthiness()


Expand Down
Loading