diff --git a/doajtest/bisector.sh b/doajtest/bisector.sh new file mode 100644 index 0000000000..08d408f903 --- /dev/null +++ b/doajtest/bisector.sh @@ -0,0 +1,181 @@ +#!/usr/bin/env bash +# language: bash +# Save as tools/find_culprit.sh and make executable (chmod +x ...) + +#set -euo pipefail +# +#TMPDIR=$(mktemp -d) +#ALL="$TMPDIR/all_tests.txt" +#OTHERS="$TMPDIR/others.txt" +#LOCKS="$TMPDIR/locks.txt" +# +#pytest --collect-only -q > "$ALL" +# +#grep 'test_lock.py' "$ALL" > "$LOCKS" || true +#grep -v 'test_lock.py' "$ALL" > "$OTHERS" || true +# +#if [ ! -s "$LOCKS" ]; then +# echo "No tests found in test_lock.py" +# rm -rf "$TMPDIR" +# exit 1 +#fi +# +#mapfile -t LOCK_ARR < "$LOCKS" +#mapfile -t OTHER_ARR < "$OTHERS" +#N=${#OTHER_ARR[@]} +# +#if [ "$N" -eq 0 ]; then +# echo "No other tests to blame; run `pytest ${LOCK_ARR[*]}` manually." +# rm -rf "$TMPDIR" +# exit 0 +#fi +# +#run_with_lock() { +# # $1..$k are nodeids from OTHER_ARR, then append LOCK_ARR +# local nodes=("$@") +# nodes+=("${LOCK_ARR[@]}") +# pytest -q "${nodes[@]}" >/dev/null 2>&1 +# return $? +#} +# +#low=0 +#high=$N +# +#while [ $((high - low)) -gt 1 ]; do +# mid=$(((low + high) / 2)) +# subset=("${OTHER_ARR[@]:0:mid}") +# echo "Testing first $mid of $N tests..." +# if run_with_lock "${subset[@]}"; then +# echo "PASS -> culprit is in tests [$mid..$((N-1))]" +# low=$mid +# else +# echo "FAIL -> culprit is in first $mid tests" +# high=$mid +# fi +#done +# +## Report suspects: small slice around boundary +#start=$((low - 2)); [ $start -lt 0 ] && start=0 +#end=$((low + 3)); [ $end -gt $N ] && end=$N +# +#echo +#echo "Suspect tests (run these before test_lock.py to reproduce):" +#for i in $(seq $start $((end - 1))); do +# printf ' %s\n' "${OTHER_ARR[$i]}" +#done +# +#echo +#echo "Re-run one of the suspect sets manually, e.g.:" +#echo " pytest -q ${OTHER_ARR[$start]} ${LOCK_ARR[*]}" +#rm -rf "$TMPDIR" + + +set -euo pipefail + +TMPDIR=$(mktemp -d) +ALL="$TMPDIR/all_tests.txt" +OTHERS="$TMPDIR/others.txt" +LOCKS="$TMPDIR/locks.txt" + +# timeout in seconds for running test_lock.py (adjust as needed) +TIMEOUT_SECONDS=65 + +pytest --collect-only -q > "$ALL" + +grep 'test_lock.py' "$ALL" > "$LOCKS" || true +grep -v 'test_lock.py' "$ALL" > "$OTHERS" || true + +if [ ! -s "$LOCKS" ]; then + echo "No tests found in test_lock.py" + rm -rf "$TMPDIR" + exit 1 +fi + +mapfile -t LOCK_ARR < "$LOCKS" +mapfile -t OTHER_ARR < "$OTHERS" +N=${#OTHER_ARR[@]} + +if [ "$N" -eq 0 ]; then + echo "No other tests to blame; run \`pytest ${LOCK_ARR[*]}\` manually." + rm -rf "$TMPDIR" + exit 0 +fi + +run_with_lock() { + # Temporarily disable -e so failures/timeouts don't exit the whole script + set +e + local nodes=("$@") + nodes+=("${LOCK_ARR[@]}") + + if command -v timeout >/dev/null 2>&1; then + # Use GNU timeout; --preserve-status keeps pytest exit codes, --foreground helps pytest handle signals + timeout --preserve-status --foreground "${TIMEOUT_SECONDS}s" pytest -q "${nodes[@]}" >/dev/null 2>&1 + rc=$? + # timeout returns 124 on timeout, 137 if killed; treat these as failure (non-zero) + if [ "$rc" -eq 124 ] || [ "$rc" -eq 137 ]; then + echo "pytest timed out after ${TIMEOUT_SECONDS}s" + rc=1 + fi + set -e + return $rc + else + # Portable fallback: run pytest in background and kill it if it exceeds TIMEOUT_SECONDS + pytest -q "${nodes[@]}" >/dev/null 2>&1 & + pid=$! + end_time=$((SECONDS + TIMEOUT_SECONDS)) + rc=0 + while kill -0 "$pid" >/dev/null 2>&1; do + if [ "$SECONDS" -ge "$end_time" ]; then + echo "pytest timed out after ${TIMEOUT_SECONDS}s; terminating (pid $pid)" + kill -TERM "$pid" 2>/dev/null || true + sleep 5 + kill -KILL "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + rc=1 + break + fi + sleep 1 + done + # If process exited normally before timeout, capture its exit code + if kill -0 "$pid" >/dev/null 2>&1; then + # already handled timeout and killed + : + else + wait "$pid" >/dev/null 2>&1 || rc=$? + # if pytest returned non-zero, rc will reflect that + fi + set -e + return $rc + fi +} + +low=0 +high=$N + +while [ $((high - low)) -gt 1 ]; do + mid=$(((low + high) / 2)) + subset=("${OTHER_ARR[@]:0:mid}") + echo "Testing first $mid of $N tests..." + if run_with_lock "${subset[@]}"; then + echo "PASS -> culprit is in tests [$mid..$((N-1))]" + low=$mid + else + echo "FAIL -> culprit is in first $mid tests" + high=$mid + fi +done + +# Report suspects: small slice around boundary +start=$((low - 2)); [ $start -lt 0 ] && start=0 +end=$((low + 3)); [ $end -gt $N ] && end=$N + +echo +echo "Suspect tests (run these before test_lock.py to reproduce):" +for i in $(seq $start $((end - 1))); do + printf ' %s\n' "${OTHER_ARR[$i]}" +done + +echo +echo "Re-run one of the suspect sets manually, e.g.:" +echo " pytest -q ${OTHER_ARR[$start]} ${LOCK_ARR[*]}" +rm -rf "$TMPDIR" diff --git a/doajtest/unit/conftest.py b/doajtest/unit/conftest.py new file mode 100644 index 0000000000..6d30a90082 --- /dev/null +++ b/doajtest/unit/conftest.py @@ -0,0 +1,82 @@ +# python +import os +import sys +import threading +import subprocess +import faulthandler +import time + +# Name fragment of the test that fails when run in the full suite +_TARGET = "test_lock.py" + +def _dump_state(label): + pid = os.getpid() + print(f"\n=== STATE DUMP: {label} (pid={pid}) ===", file=sys.stderr) + # Python threads + print("Threads:", file=sys.stderr) + for t in threading.enumerate(): + print(f" {t.name!s} alive={t.is_alive()} daemon={t.daemon}", file=sys.stderr) + + # Python-level stacks + print("\nPython thread stacks (faulthandler):", file=sys.stderr) + faulthandler.dump_traceback(file=sys.stderr, all_threads=True) + + # child processes whose parent is this pid (may be empty if detached) + print("\nChild processes (ps --ppid):", file=sys.stderr) + try: + subprocess.run(["ps", "-o", "pid,ppid,cmd", "--ppid", str(pid)], + check=False, text=True, stdout=sys.stderr) + except Exception as e: + print(" failed to run ps:", e, file=sys.stderr) + + # open file descriptors for this process (Linux /proc) + try: + fds = os.listdir(f"/proc/{pid}/fd") + print(f"\nOpen fds: {len(fds)}", file=sys.stderr) + # print a small sample + for fd in sorted(fds)[:50]: + try: + target = os.readlink(f"/proc/{pid}/fd/{fd}") + except Exception: + target = "" + print(f" fd {fd}: {target}", file=sys.stderr) + except Exception as e: + print(" cannot list /proc fds:", e, file=sys.stderr) + + # optional: lsof for full listing if available (may be noisy) + try: + subprocess.run(["lsof", "-p", str(pid)], check=False, text=True, stdout=sys.stderr, stderr=subprocess.DEVNULL) + except Exception: + pass + + print("=== END STATE DUMP ===\n", file=sys.stderr) + + +def pytest_runtest_setup(item): + # keep output small: only emit detailed dumps for the target test or every 50th test + if _TARGET in item.nodeid: + _dump_state(f"before {item.nodeid}") + + +def pytest_runtest_teardown(item, nextitem): + if _TARGET in item.nodeid: + # short pause so any background cleanup can run, then snapshot state + time.sleep(0.1) + _dump_state(f"after {item.nodeid}") + + +def pytest_sessionfinish(session, exitstatus): + # keep your existing final dump too (helps when the process hangs after sessionfinish) + print("\n=== pytest_sessionfinish: active threads ===", file=sys.stderr) + for t in threading.enumerate(): + print(f" {t.name!s} alive={t.is_alive()} daemon={t.daemon}", file=sys.stderr) + + print("\n=== Python thread stacks (faulthandler) ===", file=sys.stderr) + faulthandler.dump_traceback(file=sys.stderr, all_threads=True) + + print("\n=== child processes (ps) ===", file=sys.stderr) + try: + subprocess.run(["ps", "-o", "pid,ppid,cmd", "--ppid", str(os.getpid())], + check=False, text=True, stdout=sys.stderr) + except Exception as e: + print(" failed to list children:", e, file=sys.stderr) \ No newline at end of file diff --git a/doajtest/unit/test_lock.py b/doajtest/unit/test_lock.py index aad06b850f..d451306b9f 100644 --- a/doajtest/unit/test_lock.py +++ b/doajtest/unit/test_lock.py @@ -1,3 +1,5 @@ +import pytest + from doajtest.helpers import DoajTestCase from portality import models, lock from portality.lib import dates @@ -6,12 +8,18 @@ from datetime import timedelta from copy import deepcopy +from portality.lib.thread_utils import wait_until + +pytestmark = pytest.mark.skip(reason="skip whole module while debugging") + class TestLock(DoajTestCase): def test_01_lock_success_fail(self): + print("Starting test_01_lock_success_fail") source = JournalFixtureFactory.make_journal_source() j = models.Journal(**source) j.save(blocking=True) + print("saved journal") # first set a lock l = lock.lock("journal", j.id, "testuser") @@ -19,8 +27,10 @@ def test_01_lock_success_fail(self): assert l.type == "journal" assert l.username == "testuser" assert not l.is_expired() + print("set lock") - time.sleep(1) + wait_until(lambda: lock.has_lock("journal", j.id, "testuser") is True, timeout=5) + # time.sleep(1) # now try and set the lock again for the same thing by the same user l2 = lock.lock("journal", j.id, "testuser") @@ -31,6 +41,8 @@ def test_01_lock_success_fail(self): with self.assertRaises(lock.Locked): l3 = lock.lock("journal", j.id, "otheruser") + print("completed test_01_lock_success_fail") + def test_02_unlock(self): unlocked = lock.unlock("journal", "qojoiwjreiqwefoijqwiefjw", "testuser") assert unlocked is True diff --git a/portality/dao.py b/portality/dao.py index 370dc6a59e..c3573f179d 100644 --- a/portality/dao.py +++ b/portality/dao.py @@ -518,7 +518,9 @@ def destroy_index(cls, **es_kwargs): # else: # return esprit.raw.delete_index(es_connection) print('Destroying indexes with prefix ' + app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*') - return ES.indices.delete(app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*', **es_kwargs) + resp = ES.indices.delete(app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*', **es_kwargs) + print("Destroyed indexes with prefix " + app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*') + return resp @classmethod def check_es_raw_response(cls, res, extra_trace_info=''):