Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 181 additions & 0 deletions doajtest/bisector.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/usr/bin/env bash
# language: bash
# Save as doajtest/bisector.sh and make executable (chmod +x doajtest/bisector.sh)

#set -euo pipefail
#
#TMPDIR=$(mktemp -d)
#ALL="$TMPDIR/all_tests.txt"
#OTHERS="$TMPDIR/others.txt"
#LOCKS="$TMPDIR/locks.txt"
#
#pytest --collect-only -q > "$ALL"
#
#grep 'test_lock.py' "$ALL" > "$LOCKS" || true
#grep -v 'test_lock.py' "$ALL" > "$OTHERS" || true
#
#if [ ! -s "$LOCKS" ]; then
# echo "No tests found in test_lock.py"
# rm -rf "$TMPDIR"
# exit 1
#fi
#
#mapfile -t LOCK_ARR < "$LOCKS"
#mapfile -t OTHER_ARR < "$OTHERS"
#N=${#OTHER_ARR[@]}
#
#if [ "$N" -eq 0 ]; then
# echo "No other tests to blame; run `pytest ${LOCK_ARR[*]}` manually."
# rm -rf "$TMPDIR"
# exit 0
#fi
#
#run_with_lock() {
# # $1..$k are nodeids from OTHER_ARR, then append LOCK_ARR
# local nodes=("$@")
# nodes+=("${LOCK_ARR[@]}")
# pytest -q "${nodes[@]}" >/dev/null 2>&1
# return $?
#}
#
#low=0
#high=$N
#
#while [ $((high - low)) -gt 1 ]; do
# mid=$(((low + high) / 2))
# subset=("${OTHER_ARR[@]:0:mid}")
# echo "Testing first $mid of $N tests..."
# if run_with_lock "${subset[@]}"; then
# echo "PASS -> culprit is in tests [$mid..$((N-1))]"
# low=$mid
# else
# echo "FAIL -> culprit is in first $mid tests"
# high=$mid
# fi
#done
#
## Report suspects: small slice around boundary
#start=$((low - 2)); [ $start -lt 0 ] && start=0
#end=$((low + 3)); [ $end -gt $N ] && end=$N
#
#echo
#echo "Suspect tests (run these before test_lock.py to reproduce):"
#for i in $(seq $start $((end - 1))); do
# printf ' %s\n' "${OTHER_ARR[$i]}"
#done
#
#echo
#echo "Re-run one of the suspect sets manually, e.g.:"
#echo " pytest -q ${OTHER_ARR[$start]} ${LOCK_ARR[*]}"
#rm -rf "$TMPDIR"


set -euo pipefail

# Working area for the collected test lists.
# NOTE(review): this shadows the conventional TMPDIR env var; mktemp has
# already consumed the inherited value, so the shadowing is harmless here,
# but a rename (e.g. WORKDIR) would be cleaner if the script grows.
TMPDIR=$(mktemp -d)
ALL="$TMPDIR/all_tests.txt"
OTHERS="$TMPDIR/others.txt"
LOCKS="$TMPDIR/locks.txt"

# Clean up on EVERY exit path (success, error under set -e, or signal).
# Previously the temp dir leaked whenever pytest/grep failed mid-script.
trap 'rm -rf "$TMPDIR"' EXIT

# timeout in seconds for running test_lock.py (adjust as needed)
TIMEOUT_SECONDS=65

pytest --collect-only -q > "$ALL"

# `pytest -q --collect-only` ends with a blank line and an "N tests
# collected ..." summary; keep only real nodeids (they contain '::') so
# junk lines are never handed back to pytest as arguments.
grep '::' "$ALL" | grep 'test_lock.py' > "$LOCKS" || true
grep '::' "$ALL" | grep -v 'test_lock.py' > "$OTHERS" || true

if [ ! -s "$LOCKS" ]; then
    echo "No tests found in test_lock.py"
    exit 1
fi

mapfile -t LOCK_ARR < "$LOCKS"
mapfile -t OTHER_ARR < "$OTHERS"
N=${#OTHER_ARR[@]}

if [ "$N" -eq 0 ]; then
    echo "No other tests to blame; run \`pytest ${LOCK_ARR[*]}\` manually."
    exit 0
fi

run_with_lock() {
    # Run pytest on the given nodeids followed by the test_lock.py tests,
    # bounded by TIMEOUT_SECONDS.  Returns pytest's exit status, or 1 on
    # timeout.  pytest output is discarded; diagnostics go to stdout.
    #
    # Globals read: LOCK_ARR, TIMEOUT_SECONDS
    set +e  # a failing/timed-out pytest must not abort the whole bisection
    local nodes=("$@")
    nodes+=("${LOCK_ARR[@]}")
    local rc=0

    if command -v timeout >/dev/null 2>&1; then
        # GNU timeout: --preserve-status keeps pytest's own exit code,
        # --foreground helps pytest receive signals normally.
        timeout --preserve-status --foreground "${TIMEOUT_SECONDS}s" pytest -q "${nodes[@]}" >/dev/null 2>&1
        rc=$?
        # 124 = timeout's "timed out" status, 137 = killed via SIGKILL;
        # normalise both to a plain failure.
        if [ "$rc" -eq 124 ] || [ "$rc" -eq 137 ]; then
            echo "pytest timed out after ${TIMEOUT_SECONDS}s"
            rc=1
        fi
    else
        # Portable fallback: background pytest and poll once per second,
        # killing it when the deadline passes.
        pytest -q "${nodes[@]}" >/dev/null 2>&1 &
        local pid=$!
        local deadline=$((SECONDS + TIMEOUT_SECONDS))
        local timed_out=0
        while kill -0 "$pid" 2>/dev/null; do
            if [ "$SECONDS" -ge "$deadline" ]; then
                echo "pytest timed out after ${TIMEOUT_SECONDS}s; terminating (pid $pid)"
                kill -TERM "$pid" 2>/dev/null || true
                sleep 5
                kill -KILL "$pid" 2>/dev/null || true
                timed_out=1
                break
            fi
            sleep 1
        done
        if [ "$timed_out" -eq 1 ]; then
            # Reap the killed child exactly once and report failure.
            # (The original re-waited an already-reaped pid afterwards,
            # clobbering rc=1 with wait's 127.)
            wait "$pid" 2>/dev/null || true
            rc=1
        else
            # Child exited on its own: collect its real exit status.
            wait "$pid"
            rc=$?
        fi
    fi

    set -e
    return "$rc"
}

low=0
high=$N

# Binary search over prefixes of OTHER_ARR.  Window invariant (assuming the
# full set reproduces the failure): a prefix of length $low passes when run
# before test_lock.py, a prefix of length $high fails.  Halve until the
# window is one test wide.
while (( high - low > 1 )); do
    mid=$(( (low + high) / 2 ))
    prefix=("${OTHER_ARR[@]:0:mid}")
    echo "Testing first $mid of $N tests..."
    if run_with_lock "${prefix[@]}"; then
        echo "PASS -> culprit is in tests [$mid..$((N - 1))]"
        low=$mid
    else
        echo "FAIL -> culprit is in first $mid tests"
        high=$mid
    fi
done

# Report suspects: a small slice of the ordering around the bisection
# boundary.  The search converged on OTHER_ARR[low] as the most likely
# trigger; include two neighbours on each side as a safety margin.
start=$((low - 2)); [ "$start" -lt 0 ] && start=0
end=$((low + 3)); [ "$end" -gt "$N" ] && end=$N

echo
echo "Suspect tests (run these before test_lock.py to reproduce):"
# Arithmetic for-loop instead of spawning `seq`.
for ((i = start; i < end; i++)); do
    printf ' %s\n' "${OTHER_ARR[$i]}"
done

echo
echo "Re-run one of the suspect sets manually, e.g.:"
echo " pytest -q ${OTHER_ARR[$start]} ${LOCK_ARR[*]}"
rm -rf "$TMPDIR"
82 changes: 82 additions & 0 deletions doajtest/unit/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# python
import os
import sys
import threading
import subprocess
import faulthandler
import time

# Name fragment of the test that fails when run in the full suite
_TARGET = "test_lock.py"

def _dump_state(label):
pid = os.getpid()
print(f"\n=== STATE DUMP: {label} (pid={pid}) ===", file=sys.stderr)
# Python threads
print("Threads:", file=sys.stderr)
for t in threading.enumerate():
print(f" {t.name!s} alive={t.is_alive()} daemon={t.daemon}", file=sys.stderr)

# Python-level stacks
print("\nPython thread stacks (faulthandler):", file=sys.stderr)
faulthandler.dump_traceback(file=sys.stderr, all_threads=True)

# child processes whose parent is this pid (may be empty if detached)
print("\nChild processes (ps --ppid):", file=sys.stderr)
try:
subprocess.run(["ps", "-o", "pid,ppid,cmd", "--ppid", str(pid)],
check=False, text=True, stdout=sys.stderr)
except Exception as e:
print(" failed to run ps:", e, file=sys.stderr)

# open file descriptors for this process (Linux /proc)
try:
fds = os.listdir(f"/proc/{pid}/fd")
print(f"\nOpen fds: {len(fds)}", file=sys.stderr)
# print a small sample
for fd in sorted(fds)[:50]:
try:
target = os.readlink(f"/proc/{pid}/fd/{fd}")
except Exception:
target = "<unreadable>"
print(f" fd {fd}: {target}", file=sys.stderr)
except Exception as e:
print(" cannot list /proc fds:", e, file=sys.stderr)

# optional: lsof for full listing if available (may be noisy)
try:
subprocess.run(["lsof", "-p", str(pid)], check=False, text=True, stdout=sys.stderr, stderr=subprocess.DEVNULL)
except Exception:
pass

print("=== END STATE DUMP ===\n", file=sys.stderr)


def pytest_runtest_setup(item):
    """Dump process state just before the test under investigation runs."""
    # Keep the log readable: only the target test gets a detailed dump.
    if _TARGET not in item.nodeid:
        return
    _dump_state(f"before {item.nodeid}")


def pytest_runtest_teardown(item, nextitem):
    """Dump process state just after the test under investigation finishes."""
    if _TARGET not in item.nodeid:
        return
    # short pause so any background cleanup can run, then snapshot state
    time.sleep(0.1)
    _dump_state(f"after {item.nodeid}")


def pytest_sessionfinish(session, exitstatus):
    """Emit a final thread/child-process snapshot when the session ends.

    Helps diagnose the case where pytest hangs *after* sessionfinish
    because of lingering non-daemon threads or unreaped children.
    """
    err = sys.stderr
    print("\n=== pytest_sessionfinish: active threads ===", file=err)
    for worker in threading.enumerate():
        print(f" {worker.name!s} alive={worker.is_alive()} daemon={worker.daemon}", file=err)

    print("\n=== Python thread stacks (faulthandler) ===", file=err)
    faulthandler.dump_traceback(file=err, all_threads=True)

    print("\n=== child processes (ps) ===", file=err)
    try:
        subprocess.run(
            ["ps", "-o", "pid,ppid,cmd", "--ppid", str(os.getpid())],
            check=False, text=True, stdout=err)
    except Exception as exc:
        print(" failed to list children:", exc, file=err)
14 changes: 13 additions & 1 deletion doajtest/unit/test_lock.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pytest

from doajtest.helpers import DoajTestCase
from portality import models, lock
from portality.lib import dates
Expand All @@ -6,21 +8,29 @@
from datetime import timedelta
from copy import deepcopy

from portality.lib.thread_utils import wait_until

pytestmark = pytest.mark.skip(reason="skip whole module while debugging")

class TestLock(DoajTestCase):

def test_01_lock_success_fail(self):
print("Starting test_01_lock_success_fail")
source = JournalFixtureFactory.make_journal_source()
j = models.Journal(**source)
j.save(blocking=True)
print("saved journal")

# first set a lock
l = lock.lock("journal", j.id, "testuser")
assert l.about == j.id
assert l.type == "journal"
assert l.username == "testuser"
assert not l.is_expired()
print("set lock")

time.sleep(1)
wait_until(lambda: lock.has_lock("journal", j.id, "testuser") is True, timeout=5)
# time.sleep(1)

# now try and set the lock again for the same thing by the same user
l2 = lock.lock("journal", j.id, "testuser")
Expand All @@ -31,6 +41,8 @@ def test_01_lock_success_fail(self):
with self.assertRaises(lock.Locked):
l3 = lock.lock("journal", j.id, "otheruser")

print("completed test_01_lock_success_fail")

def test_02_unlock(self):
unlocked = lock.unlock("journal", "qojoiwjreiqwefoijqwiefjw", "testuser")
assert unlocked is True
Expand Down
4 changes: 3 additions & 1 deletion portality/dao.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,9 @@ def destroy_index(cls, **es_kwargs):
# else:
# return esprit.raw.delete_index(es_connection)
print('Destroying indexes with prefix ' + app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*')
return ES.indices.delete(app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*', **es_kwargs)
resp = ES.indices.delete(app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*', **es_kwargs)
print("Destroyed indexes with prefix " + app.config['ELASTIC_SEARCH_DB_PREFIX'] + '*')
return resp

@classmethod
def check_es_raw_response(cls, res, extra_trace_info=''):
Expand Down