Clean up after long-running whitebox crashtest (facebook#12248)

Summary:
Currently, we treat the long-running whitebox_crash_test as passing. However, we were not cleaning up after ourselves when we killed the running test for running too long, which often caused out-of-space errors in subsequent tests (e.g., blackbox_crash_test after whitebox_crash_test). Unless we want to start treating these timeouts as failures and need the DB output for investigation now, we should properly clean up the tmp dir.

Pull Request resolved: facebook#12248

Test Plan:
```
$> make crash_test -j
```

Reviewed By: ajkr

Differential Revision: D52885342

Pulled By: jaykorean

fbshipit-source-id: 7c1f2ca7cf03d0705bb14155ee44d5d7a411c132
rockset · Jan 20, 2024 · d982260 · d982260
1 parent d69628e
commit d982260
Showing 1 changed file with 23 additions and 14 deletions.
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
@@ -855,6 +855,15 @@ def exit_if_stderr_has_errors(stderr, print_stderr=True):
         print("TEST FAILED. Output has 'fail'!!!\n")
         sys.exit(2)
 
+def cleanup_after_success(dbname):
+    shutil.rmtree(dbname, True)
+    if cleanup_cmd is not None:
+        print("Running DB cleanup command - %s\n" % cleanup_cmd)
+        ret = os.system(cleanup_cmd)
+        if ret != 0:
+            print("TEST FAILED. DB cleanup returned error %d\n" % ret)
+            sys.exit(1)
+
 # This script runs and kills db_stress multiple times. It checks consistency
 # in case of unsafe crashes in RocksDB.
 def blackbox_crash_main(args, unknown_args):
@@ -910,7 +919,7 @@ def blackbox_crash_main(args, unknown_args):
     exit_if_stderr_has_errors(errs)
 
     # we need to clean up after ourselves -- only do this on test success
-    shutil.rmtree(dbname, True)
+    cleanup_after_success(dbname)
 
 
 # This python script runs db_stress multiple times. Some runs with
@@ -935,6 +944,8 @@ def whitebox_crash_main(args, unknown_args):
     kill_random_test = cmd_params["random_kill_odd"]
     kill_mode = 0
     prev_compaction_style = -1
+    succeeded = True
+    hit_timeout = False
     while time.time() < exit_time:
         if check_mode == 0:
             additional_opts = {
@@ -1056,16 +1067,16 @@ def whitebox_crash_main(args, unknown_args):
             print("Killing the run for running too long")
             break
 
-        expected = False
+        succeeded = False
         if additional_opts["kill_random_test"] is None and (retncode == 0):
             # we expect zero retncode if no kill option
-            expected = True
+            succeeded = True
         elif additional_opts["kill_random_test"] is not None and retncode <= 0:
             # When kill option is given, the test MIGHT kill itself.
             # If it does, negative retncode is expected. Otherwise 0.
-            expected = True
+            succeeded = True
 
-        if not expected:
+        if not succeeded:
             print("TEST FAILED. See kill option and exit code above!!!\n")
             sys.exit(1)
 
@@ -1075,15 +1086,7 @@ def whitebox_crash_main(args, unknown_args):
         # First half of the duration, keep doing kill test. For the next half,
         # try different modes.
         if time.time() > half_time:
-            # we need to clean up after ourselves -- only do this on test
-            # success
-            shutil.rmtree(dbname, True)
-            if cleanup_cmd is not None:
-                print("Running DB cleanup command - %s\n" % cleanup_cmd)
-                ret = os.system(cleanup_cmd)
-                if ret != 0:
-                    print("TEST FAILED. DB cleanup returned error %d\n" % ret)
-                    sys.exit(1)
+            cleanup_after_success(dbname)
             try:
                 os.mkdir(dbname)
             except OSError:
@@ -1097,6 +1100,12 @@ def whitebox_crash_main(args, unknown_args):
         time.sleep(1)  # time to stabilize after a kill
 
 
+    # If the run finished successfully or timed out (a timed-out test is
+    # currently treated as passing), clean up after ourselves.
+    if succeeded or hit_timeout:
+        cleanup_after_success(dbname)
+
+
 def main():
     global stress_cmd
     global cleanup_cmd