Skip to content

Commit

Permalink
Clean up after long-running whitebox crashtest (facebook#12248)
Browse files Browse the repository at this point in the history
Summary:
Currently, we treat a long-running whitebox_crash_test that times out as passing. However, we did not clean up after ourselves when we killed the test for running too long, which often caused out-of-space errors in subsequent tests (e.g., blackbox_crash_test after whitebox_crash_test).

Unless we want to start treating these timeouts as failures and need the DB output for investigation now, we should properly clean up the tmp dir.

Pull Request resolved: facebook#12248

Test Plan:
```
$> make crash_test -j
```

Reviewed By: ajkr

Differential Revision: D52885342

Pulled By: jaykorean

fbshipit-source-id: 7c1f2ca7cf03d0705bb14155ee44d5d7a411c132
  • Loading branch information
jaykorean authored and facebook-github-bot committed Jan 20, 2024
1 parent d69628e commit d982260
Showing 1 changed file with 23 additions and 14 deletions.
37 changes: 23 additions & 14 deletions tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,15 @@ def exit_if_stderr_has_errors(stderr, print_stderr=True):
print("TEST FAILED. Output has 'fail'!!!\n")
sys.exit(2)

def cleanup_after_success(dbname):
    """Delete the test DB directory and run the optional cleanup command.

    The directory tree at ``dbname`` is removed with errors ignored. If the
    module-level ``cleanup_cmd`` is set, it is executed through the shell;
    a non-zero result is reported and terminates the script with exit
    status 1.
    """
    # Second positional argument is ignore_errors: a missing or partially
    # removed directory must not abort the cleanup.
    shutil.rmtree(dbname, True)

    if cleanup_cmd is None:
        return

    print("Running DB cleanup command - %s\n" % cleanup_cmd)
    status = os.system(cleanup_cmd)
    if status != 0:
        print("TEST FAILED. DB cleanup returned error %d\n" % status)
        sys.exit(1)

# This script runs and kills db_stress multiple times. It checks consistency
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args, unknown_args):
Expand Down Expand Up @@ -910,7 +919,7 @@ def blackbox_crash_main(args, unknown_args):
exit_if_stderr_has_errors(errs)

# we need to clean up after ourselves -- only do this on test success
shutil.rmtree(dbname, True)
cleanup_after_success(dbname)


# This python script runs db_stress multiple times. Some runs with
Expand All @@ -935,6 +944,8 @@ def whitebox_crash_main(args, unknown_args):
kill_random_test = cmd_params["random_kill_odd"]
kill_mode = 0
prev_compaction_style = -1
succeeded = True
hit_timeout = False
while time.time() < exit_time:
if check_mode == 0:
additional_opts = {
Expand Down Expand Up @@ -1056,16 +1067,16 @@ def whitebox_crash_main(args, unknown_args):
print("Killing the run for running too long")
break

expected = False
succeeded = False
if additional_opts["kill_random_test"] is None and (retncode == 0):
# we expect zero retncode if no kill option
expected = True
succeeded = True
elif additional_opts["kill_random_test"] is not None and retncode <= 0:
# When kill option is given, the test MIGHT kill itself.
# If it does, negative retncode is expected. Otherwise 0.
expected = True
succeeded = True

if not expected:
if not succeeded:
print("TEST FAILED. See kill option and exit code above!!!\n")
sys.exit(1)

Expand All @@ -1075,15 +1086,7 @@ def whitebox_crash_main(args, unknown_args):
# First half of the duration, keep doing kill test. For the next half,
# try different modes.
if time.time() > half_time:
# we need to clean up after ourselves -- only do this on test
# success
shutil.rmtree(dbname, True)
if cleanup_cmd is not None:
print("Running DB cleanup command - %s\n" % cleanup_cmd)
ret = os.system(cleanup_cmd)
if ret != 0:
print("TEST FAILED. DB cleanup returned error %d\n" % ret)
sys.exit(1)
cleanup_after_success(dbname)
try:
os.mkdir(dbname)
except OSError:
Expand All @@ -1097,6 +1100,12 @@ def whitebox_crash_main(args, unknown_args):
time.sleep(1) # time to stabilize after a kill


# If the test finished successfully or timed out (we currently treat a
# timed-out test as passing), clean up after ourselves.
if succeeded or hit_timeout:
cleanup_after_success(dbname)


def main():
global stress_cmd
global cleanup_cmd
Expand Down

0 comments on commit d982260

Please sign in to comment.